changeset 1:1c4b24e9bb16 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 5be33ea99532ab3abb000564af4c63c81c4ccd87
author mbernt
date Mon, 16 Jul 2018 11:01:52 -0400
parents ec898924d8c7
children 4952f1ece60b
files getLongestORF.py test-data/test_input.fasta test-data/test_output.fasta test-data/test_output.tab
diffstat 4 files changed, 248 insertions(+), 293 deletions(-) [+]
line wrap: on
line diff
--- a/getLongestORF.py	Wed Jun 20 11:02:06 2018 -0400
+++ b/getLongestORF.py	Mon Jul 16 11:01:52 2018 -0400
@@ -1,114 +1,108 @@
 #!/usr/bin/env python
 
-"""
-usage: getLongestORF.py input output.fas output.tab
-
-
-input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf")
-output.fas: fasta file with all longest ORFs per transcript
-output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs
-
-example:
+#example:
+#>STRG.1.1(-)_1 [10 - 69]
+#GGNHHTLGGKKTFSYTHPPC
+#>STRG.1.1(-)_2 [3 - 80]
+#FLRGEPPHIGGKKDIFLHPPTLLKGR
 
->253936-254394(+)_1 [28 - 63] 
-LTNYCQMVHNIL
->253936-254394(+)_2 [18 - 77] 
-HKLIDKLLPNGAQYFVKSTQ
->253936-254394(+)_3 [32 - 148] 
-QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV
->253936-254394(+)_4 [117 - 152] 
-LGIIVPSLLLCN
->248351-252461(+)_1 [14 - 85] 
-VLARKYPRCLSPSKKSPCQLRQRS
->248351-252461(+)_2 [21 - 161] 
-PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP
->248351-252461(+)_3 [89 - 202] 
-VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG
->248351-252461(+)_4 [206 - 259] 
-SRTTSGRQSVLSEKLWRR
->248351-252461(+)_5 [263 - 313] 
-CLSPLWVPCCSRHSCHG
-"""
+#output1: FASTA file containing the longest ORF of each transcript
+#output2: table with the columns seqID, transcript, orf_start, orf_end, length, strand, sense, longest (y/n) for all ORFs
 
-import sys,re
+import sys,re;
 
 def findlongestOrf(transcriptDict,old_seqID):
-    #write for previous seqID
-    prevTranscript = transcriptDict[old_seqID]
-    i_max = 0
-    #find longest orf in transcript
-    for i in range(0,len(prevTranscript)):
-        if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
-            i_max = i
-    for i in range(0,len(prevTranscript)):
-        prevStart = prevTranscript[i][0]
-        prevEnd = prevTranscript[i][1]
-        prevLength = prevTranscript[i][2]
-        output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength)
-        if (end - start > 0):
-            output+="\tForward"
-        else:
-            output+="\tReverse"
-        if(i == i_max):
-            output += "\ty\n"
-        else:
-            output += "\tn\n"
-        OUTPUT_ORF_SUMMARY.write(output)
-    transcriptDict.pop(old_seqID, None)
-    return None
+	#write one summary row per ORF of the previous seqID
+	prevTranscript = transcriptDict[old_seqID];
+	i_max = 0;
+	transcript = old_seqID.split("(")[0]
+
+	#find longest orf in transcript
+	for i in range(0,len(prevTranscript)):
+		if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
+			i_max = i;
 
-INPUT = open(sys.argv[1],"r")
-OUTPUT_FASTA = open(sys.argv[2],"w")
-OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w")
+	for i in range(0,len(prevTranscript)):
+		prevORFstart = prevTranscript[i][0];
+		prevORFend = prevTranscript[i][1];
+		prevORFlength = prevTranscript[i][2];
+		header = prevTranscript[i][3];
+		strand = re.search('\(([+-]+)\)',header).group(1);
+		
+		output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand);
+		if (prevORFend - prevORFstart > 0):
+			output+="\tnormal";
+		else:
+			output+="\treverse_sense";
+		if(i == i_max):
+			output += "\ty\n";
+		else:
+			output += "\tn\n";
+
+		OUTPUT_ORF_SUMMARY.write(output);
 
-seqID = ""
-old_seqID = ""
-lengthDict = {}
-seqDict = {}
-headerDict = {}
-transcriptDict = {}
-skip = False
+	transcriptDict.pop(old_seqID, None);
+	return None;
+
+#-----------------------------------------------------------------------------------------------------
+
+INPUT = open(sys.argv[1],"r");
+OUTPUT_FASTA = open(sys.argv[2],"w");
+OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w");
 
-OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n")
+seqID = "";
+old_seqID = "";
+lengthDict = {};
+seqDict = {};
+headerDict = {};
+transcriptDict = {};
+
+skip = False;
+
+OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n");
 
 for line in INPUT:
-    line = line.strip()
-#    print line
-    if(re.match(">",line)): #header
-        seqID = "_".join(line.split(">")[1].split("_")[:-1])
-        #seqID = line.split(">")[1].split("_")[0]
-        start = int (re.search('\ \[(\d+)\ -', line).group(1))
-        end = int (re.search('-\ (\d+)\]',line).group(1))
-        length = abs(end - start)
-        if(seqID not in transcriptDict and old_seqID != ""): #new transcript
-            findlongestOrf(transcriptDict,old_seqID)
-        if seqID not in transcriptDict:
-            transcriptDict[seqID] = []
-        transcriptDict[seqID].append([start,end,length])
-        if(seqID not in lengthDict and old_seqID != ""): #new transcript
-            #write FASTA
-            OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n")
-            #delete old dict entry
-            headerDict.pop(old_seqID, None)
-            seqDict.pop(old_seqID, None)
-            lengthDict.pop(old_seqID, None)
-        #if several longest sequences exist with the same length, the dictionary saves the last occuring.
-        if(seqID not in lengthDict or length >= lengthDict[seqID]):
-            headerDict[seqID] = line
-            lengthDict[seqID] = length
-            seqDict[seqID] = ""
-            skip = False
-        else:
-            skip = True
-            next
-        old_seqID = seqID
-    elif(skip):
-        next
-    else:
-        seqDict[seqID] += line
+	line = line.strip();
+	if(re.match(">",line)): #header
+		header = line.split(">")[1].split(" ")[0]
+		seqID = "_".join(line.split(">")[1].split("_")[:-1])
+		ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1));
+		ORFend = int (re.search('-\ (\d+)\]',line).group(1));
+		length = abs(ORFend - ORFstart);
+
+		if(seqID not in transcriptDict and old_seqID != ""): #new transcript
+			findlongestOrf(transcriptDict,old_seqID);
+			
+		if seqID not in transcriptDict:
+			transcriptDict[seqID] = [];
+
+		transcriptDict[seqID].append([ORFstart,ORFend,length,header]);
 
-OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID])
-findlongestOrf(transcriptDict,old_seqID)
-INPUT.close()
-OUTPUT_FASTA.close()
-OUTPUT_ORF_SUMMARY.close()
+		if(seqID not in lengthDict and old_seqID != ""): #new transcript
+			#write FASTA
+			OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n");
+			#delete old dict entry
+			headerDict.pop(old_seqID, None);
+			seqDict.pop(old_seqID, None);
+			lengthDict.pop(old_seqID, None);
+		#if several longest sequences exist with the same length, the dictionary keeps the last occurring one.
+		if(seqID not in lengthDict or length >= lengthDict[seqID]):
+			headerDict[seqID] = line;
+			lengthDict[seqID] = length;
+			seqDict[seqID] = "";
+			skip = False;
+		else:
+			skip = True;
+			next;
+		old_seqID = seqID;
+	elif(skip):
+		next;
+	else:
+		seqDict[seqID] += line;
+
+OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]);
+findlongestOrf(transcriptDict,old_seqID);
+
+INPUT.close();
+OUTPUT_FASTA.close();
+OUTPUT_ORF_SUMMARY.close();
\ No newline at end of file
--- a/test-data/test_input.fasta	Wed Jun 20 11:02:06 2018 -0400
+++ b/test-data/test_input.fasta	Mon Jul 16 11:01:52 2018 -0400
@@ -1,127 +1,100 @@
->14520830-14521117(-)_1 [2 - 37] 
-KPLENISASREF
->14520830-14521117(-)_2 [3 - 47] 
-SPWRIFQPAENFDLQ
->14520830-14521117(-)_3 [41 - 94] 
-LAVGFGLIFLRSGWMPCL
->14520830-14521117(-)_4 [63 - 152] 
-FSYDLGGCLACDSCSSYSPNEGQCPARKLE
->14520830-14521117(-)_5 [146 - 175] 
-VGMMDLCSET
->14520830-14521117(-)_6 [156 - 200] 
-WTCVQRLNRTNKQNK
->14520830-14521117(-)_7 [1 - 240] 
-KAPGEYFSQQRILTCSRIWFDFPTIWVDALPVTVAVPIRQMKGSAPHVSWNDGPVFRDLT
-EPTSKTSENRKKEEDTGINS
->14520830-14521117(-)_8 [179 - 325] 
-QNQQAKQVRTGKRKRTLESILESCTTWFFFHSKFRGTKPLENISASREF
->14520830-14521117(-)_9 [204 - 335] 
-EQEKGRGHWNQFLRVALLGFSSIPSFVGQSPWRIFQPAENFDLQ
->14520830-14521117(-)_10 [329 - 382] 
-LAVGFGLIFLRSGWMPCL
->14520830-14521117(-)_11 [351 - 440] 
-FSYDLGGCLACDSCSSYSPNEGQCPARKLE
->14520830-14521117(-)_12 [434 - 463] 
-VGMMDLCSET
->14520830-14521117(-)_13 [444 - 488] 
-WTCVQRLNRTNKQNK
->14520830-14521117(-)_14 [244 - 528] 
-ELHYLVFLPFQVSWDKAPGEYFSQQRILTCSRIWFDFPTIWVDALPVTVAVPIRQMKGSA
-PHVSWNDGPVFRDLTEPTSKTSENRKKEEDTGINS
->14520830-14521117(-)_15 [467 - 574] 
-QNQQAKQVRTGKRKRTLESILESCTTWFFFHSKFRG
->14520830-14521117(-)_16 [492 - 575] 
-EQEKGRGHWNQFLRVALLGFSSIPSFVG
->14520830-14521117(-)_17 [532 - 576] 
-ELHYLVFLPFQVSWD
->14520830-14521117(-)_18 [575 - 543] (REVERSE SENSE) 
-SHETWNGRKTK
->14520830-14521117(-)_19 [574 - 524] (REVERSE SENSE) 
-PTKLGMEEKPSSATLKN
->14520830-14521117(-)_20 [576 - 466] (REVERSE SENSE) 
-VPRNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC
->14520830-14521117(-)_21 [520 - 458] (REVERSE SENSE) 
-FQCPLPFSCSHLFCLLVLLSL
->14520830-14521117(-)_22 [454 - 401] (REVERSE SENSE) 
-TQVHHSNLRAGHCPSFGE
->14520830-14521117(-)_23 [397 - 359] (REVERSE SENSE) 
-ELQLSQARHPPRS
->14520830-14521117(-)_24 [355 - 311] (REVERSE SENSE) 
-ENQTKSYCKSKFSAG
->14520830-14521117(-)_25 [539 - 255] (REVERSE SENSE) 
-CNSQELIPVSSSFFLFSLVLLVGSVKSLNTGPSFQLTCGALPFIWRIGTATVTGKASTQI
-VGKSNQILLQVKILCWLKYSPGALSHETWNGRKTK
->14520830-14521117(-)_26 [307 - 236] (REVERSE SENSE) 
-NILQGLCPTKLGMEEKPSSATLKN
->14520830-14521117(-)_27 [462 - 178] (REVERSE SENSE) 
-VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGFVP
-RNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC
->14520830-14521117(-)_28 [232 - 170] (REVERSE SENSE) 
-FQCPLPFSCSHLFCLLVLLSL
->14520830-14521117(-)_29 [166 - 113] (REVERSE SENSE) 
-TQVHHSNLRAGHCPSFGE
->14520830-14521117(-)_30 [109 - 71] (REVERSE SENSE) 
-ELQLSQARHPPRS
->14520830-14521117(-)_31 [67 - 23] (REVERSE SENSE) 
-ENQTKSYCKSKFSAG
->14520830-14521117(-)_32 [251 - 3] (REVERSE SENSE) 
-CNSQELIPVSSSFFLFSLVLLVGSVKSLNTGPSFQLTCGALPFIWRIGTATVTGKASTQI
-VGKSNQILLQVKILCWLKYSPGA
->14520830-14521117(-)_33 [174 - 1] (REVERSE SENSE) 
-VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGF
->103089310-103089560(-)_1 [2 - 37] 
-GTSEKFLKILLS
->103089310-103089560(-)_2 [24 - 92] 
-RFYYHRYLFWFCVSVLSADGPKL
->103089310-103089560(-)_3 [13 - 117] 
-KVSEDFIIIDTCFGFVYLYSLQMVQNCNGVCIRRK
->103089310-103089560(-)_4 [138 - 167] 
-TACIWLHCGL
->103089310-103089560(-)_5 [180 - 260] 
-NHPYVSVSGYTRKRKESQSGTKSGRYV
->103089310-103089560(-)_6 [41 - 271] 
-ILVLVLCICTLCRWSKIVMESVLEENKGKIRLNCMYMAPLWLVTLLKSSVCQCIWIHEEK
-ERVSEWNKEWEVRLKSF
->103089310-103089560(-)_7 [127 - 288] 
-NQAELHVYGSTVACDTFKIIRMSVYLDTRGKGKSLRVEQRVGGTSEKFLKILLS
->103089310-103089560(-)_8 [275 - 343] 
-RFYYHRYLFWFCVSVLSADGPKL
->103089310-103089560(-)_9 [264 - 368] 
-KVSEDFIIIDTCFGFVYLYSLQMVQNCNGVCIRRK
->103089310-103089560(-)_10 [389 - 418] 
-TACIWLHCGL
->103089310-103089560(-)_11 [378 - 500] 
-NQAELHVYGSTVACDTFKIIRMSVYLDTRGKGKSLRVEQRV
->103089310-103089560(-)_12 [292 - 501] 
-ILVLVLCICTLCRWSKIVMESVLEENKGKIRLNCMYMAPLWLVTLLKSSVCQCIWIHEEK
-ERVSEWNKEW
->103089310-103089560(-)_13 [431 - 502] 
-NHPYVSVSGYTRKRKESQSGTKSG
->103089310-103089560(-)_14 [500 - 447] (REVERSE SENSE) 
-HSLFHSETLSFSSCIQIH
->103089310-103089560(-)_15 [480 - 436] (REVERSE SENSE) 
-DSFLFLVYPDTLTYG
->103089310-103089560(-)_16 [502 - 383] (REVERSE SENSE) 
-PTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA
->103089310-103089560(-)_17 [426 - 361] (REVERSE SENSE) 
-KCHKPQWSHIHAVQPDFTLIFF
->103089310-103089560(-)_18 [357 - 289] (REVERSE SENSE) 
-YRLHYNFGPSAESTDTQNQNKYL
->103089310-103089560(-)_19 [379 - 233] (REVERSE SENSE) 
-FYPYFLLIQTPLQFWTICREYRYTKPKQVSMIIKSSETFQTYLPLFVPL
->103089310-103089560(-)_20 [279 - 196] (REVERSE SENSE) 
-NLQKLFRRTSHSLFHSETLSFSSCIQIH
->103089310-103089560(-)_21 [229 - 185] (REVERSE SENSE) 
-DSFLFLVYPDTLTYG
->103089310-103089560(-)_22 [443 - 132] (REVERSE SENSE) 
-HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNF
-SDVPPTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA
->103089310-103089560(-)_23 [175 - 110] (REVERSE SENSE) 
-KCHKPQWSHIHAVQPDFTLIFF
->103089310-103089560(-)_24 [106 - 38] (REVERSE SENSE) 
-YRLHYNFGPSAESTDTQNQNKYL
->103089310-103089560(-)_25 [128 - 3] (REVERSE SENSE) 
-FYPYFLLIQTPLQFWTICREYRYTKPKQVSMIIKSSETFQTY
->103089310-103089560(-)_26 [192 - 1] (REVERSE SENSE) 
-HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNF
-SDVP
+>STRG.4.1(-)_1 [3 - 77] 
+PNHCLRGHESPETRQSPLSGKRIPS
+>STRG.4.1(-)_2 [59 - 88] 
+WKEDPLIVCP
+>STRG.4.1(-)_3 [92 - 127] 
+WNRERYFQGFGL
+>STRG.4.1(-)_4 [131 - 268] 
+LPTQKQKDRWGTHTLERFGFTVPTMPAVISLFTETNPSSQITSTQD
+>STRG.4.1(-)_5 [81 - 332] 
+SAPDGIGKDTSRGSDYNCQPRNKRTAGGLTPWRDLGSQCPQCQRLFHYLRRRIPPVRSLQ
+LKTKFWRHPDHLGTHRLLGVPAEN
+>STRG.4.1(-)_6 [272 - 379] 
+VLETPRPSGHAQATWGSCGELSSQMQMESAMMPSTW
+>STRG.4.1(-)_7 [366 - 437] 
+CPLRGRGTGRSHGAAGHAVPPNRH
+>STRG.4.1(-)_8 [465 - 518] 
+KRYGMCCVYLDKFVGGCG
+>STRG.4.1(-)_9 [383 - 565] 
+GHWTKPWSCGACGTTQSTLRPRSSEPIKALRNVLCLPGQICWWLWLSWMGRNKKPWTWFP
+W
+>STRG.4.1(-)_10 [599 - 664] 
+SSEWWSWWTSASSPSTPVGRSS
+>STRG.4.1(-)_11 [522 - 665] 
+AGWVGTRSLGPGSLGDQRGPGGALPDRRSGGRGGHRRHPHQLPWGEAA
+>STRG.4.1(-)_12 [1 - 666] 
+DPTTVYVDMRALRHDRVRLVERGSPHSLPLMESGKILPGVRIIIANPETKGPLGDSHLGE
+IWVHSAHNASGYFTIYGDESLQSDHFNSRLSFGDTQTIWARTGYLGFLRRTELTDANGER
+HDALYVVGALDEAMELRGMRYHPIDIETSVIRAHKSVTECAVFTWTNLLVVVVELDGSEQ
+EALDLVPLVTNVVLEEHYLIVGVVVVVDIGVIPINSRGEKQR
+>STRG.4.1(-)_13 [665 - 594] (REVERSE SENSE) 
+RCFSPRELMGMTPMSTTTTTPTIR
+>STRG.4.1(-)_14 [631 - 491] (REVERSE SENSE) 
+RRCPPRPPLRRSGSAPPGPRWSPREPGPRLLVPTHPAQPQPPTNLSR
+>STRG.4.1(-)_15 [590 - 375] (REVERSE SENSE) 
+CSSRTTLVTKGTRSKASCSDPSSSTTTTNKFVQVNTAHSVTLLWALMTEVSMSIGWYRMP
+RSSMASSSAPTT
+>STRG.4.1(-)_16 [448 - 341] (REVERSE SENSE) 
+PRSQCRLGGTACPAAPWLRPVPLPRRGHHGALHLHL
+>STRG.4.1(-)_17 [371 - 312] (REVERSE SENSE) 
+RASWRSPFASVSSVLRRNPK
+>STRG.4.1(-)_18 [666 - 271] (REVERSE SENSE) 
+ALLLPTGVDGDDADVHHDHHSDDQVVLLQDHVGHQGNQVQGFLFRPIQLNHNHQQICPGK
+HSTFRNAFMGSDDRGLNVDWVVPHAPQLHGFVQCPYHVEGIMALSICICELSSPQEPQVA
+CACPDGLGVSKT
+>STRG.4.1(-)_19 [337 - 260] (REVERSE SENSE) 
+AQFSAGTPSSLCVPRWSGCLQNLVLS
+>STRG.4.1(-)_20 [308 - 255] (REVERSE SENSE) 
+PVRAQMVWVSPKLSLELK
+>STRG.4.1(-)_21 [256 - 224] (REVERSE SENSE) 
+SDLTGGIRLRK
+>STRG.4.1(-)_22 [246 - 169] (REVERSE SENSE) 
+LEGFVSVNSEITAGIVGTVNPNLSKV
+>STRG.4.1(-)_23 [217 - 128] (REVERSE SENSE) 
+NNRWHCGHCEPKSLQGVSPPAVLLFLGWQL
+>STRG.4.1(-)_24 [188 - 78] (REVERSE SENSE) 
+TQISPRCESPSGPFVSGLAIIIRTPGSIFPDSIRGRL
+>STRG.4.1(-)_25 [165 - 58] (REVERSE SENSE) 
+VPQRSFCFWVGNYNPNPWKYLSRFHQGQTMRGSSFH
+>STRG.4.1(-)_26 [74 - 18] (REVERSE SENSE) 
+GDPLSTKRTLSCLRALMST
+>STRG.4.1(-)_27 [124 - 2] (REVERSE SENSE) 
+SEPLEVSFPIPSGADYEGILFPLSGLCRVSGLSCPRRQWLG
+>STRG.4.1(-)_28 [54 - 1] (REVERSE SENSE) 
+ADSVVSQGSHVHVDSGWV
+>STRG.6.1(-)_1 [1 - 63] 
+NWDASWRKDVSRSHQCLLPFH
+>STRG.6.1(-)_2 [24 - 182] 
+RCLTQPPVPSAVPLSCSVNFTPLEKWPSAWTLTVDWDLSSGASAVCILGTSPS
+>STRG.6.1(-)_3 [94 - 195] 
+RSGHLPGPLLWTGICPLVPLQCVFWAPVHPDPAL
+>STRG.6.1(-)_4 [186 - 233] 
+SRPLSWKPTPPCGFLP
+>STRG.6.1(-)_5 [2 - 250] 
+TGMLAGVKMSHAATSAFCRSIKLQCELYPSREVAICLDPYCGLGFVLWCLCSVYSGHQSI
+LIPPSELETNPALWLLAVSQYKV
+>STRG.6.1(-)_6 [199 - 252] 
+AGNQPRLVASCRESVQSP
+>STRG.6.1(-)_7 [237 - 121] (REVERSE SENSE) 
+LTARSHKAGLVSSSEGGIRMDWCPEYTLQRHQRTNPSPQ
+>STRG.6.1(-)_8 [251 - 93] (REVERSE SENSE) 
+GLCTDSRQEATRRGWFPAQRAGSGWTGAQNTHCRGTRGQIPVHSKGPGRWPLL
+>STRG.6.1(-)_9 [117 - 85] (REVERSE SENSE) 
+GSRQMATSLEG
+>STRG.6.1(-)_10 [81 - 34] (REVERSE SENSE) 
+SSHCSLMERQKALVAA
+>STRG.6.1(-)_11 [250 - 14] (REVERSE SENSE) 
+DFVLTHGKKPQGGVGFQLRGRDQDGLVPRIHTAEAPEDKSQSTVRVQADGHFSRGVKFTL
+QLNGTAEGTGGCVRHLYAS
+>STRG.6.1(-)_12 [62 - 3] (REVERSE SENSE) 
+WNGRRHWWLRETSLRQLASQ
+>STRG.6.1(-)_13 [30 - 1] (REVERSE SENSE) 
+DIFTPASIPV
+>STRG.8.1(-)_1 [18 - 56] 
+RTSKKPNGRDPTV
+>STRG.8.1(-)_2 [60 - 95] 
+RLAKAAVVCHRV
+>STRG.8.1(-)_3 [99 - 137] 
+TSLQTAPRLVPTH
+>STRG.8.1(-)_4 [2 - 205] 
+VTPAIKDFQKAQRERSHSLKVGQSCCGLSQSLNISPNRPETGSHTLKMPITTLRILSTRR
\ No newline at end of file
--- a/test-data/test_output.fasta	Wed Jun 20 11:02:06 2018 -0400
+++ b/test-data/test_output.fasta	Mon Jul 16 11:01:52 2018 -0400
@@ -1,4 +1,6 @@
->14520830-14521117(-)_27 [462 - 178] (REVERSE SENSE)
-VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGFVPRNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC
->103089310-103089560(-)_22 [443 - 132] (REVERSE SENSE)
-HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNFSDVPPTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA
\ No newline at end of file
+>STRG.4.1(-)_12 [1 - 666]
+DPTTVYVDMRALRHDRVRLVERGSPHSLPLMESGKILPGVRIIIANPETKGPLGDSHLGEIWVHSAHNASGYFTIYGDESLQSDHFNSRLSFGDTQTIWARTGYLGFLRRTELTDANGERHDALYVVGALDEAMELRGMRYHPIDIETSVIRAHKSVTECAVFTWTNLLVVVVELDGSEQEALDLVPLVTNVVLEEHYLIVGVVVVVDIGVIPINSRGEKQR
+>STRG.6.1(-)_5 [2 - 250]
+TGMLAGVKMSHAATSAFCRSIKLQCELYPSREVAICLDPYCGLGFVLWCLCSVYSGHQSILIPPSELETNPALWLLAVSQYKV
+>STRG.8.1(-)_4 [2 - 205]
+VTPAIKDFQKAQRERSHSLKVGQSCCGLSQSLNISPNRPETGSHTLKMPITTLRILSTRR
\ No newline at end of file
--- a/test-data/test_output.tab	Wed Jun 20 11:02:06 2018 -0400
+++ b/test-data/test_output.tab	Mon Jul 16 11:01:52 2018 -0400
@@ -1,60 +1,46 @@
-seqID	start	end	length	orientation	longest
-14520830-14521117(-)	2	37	35	Forward	n
-14520830-14521117(-)	3	47	44	Forward	n
-14520830-14521117(-)	41	94	53	Forward	n
-14520830-14521117(-)	63	152	89	Forward	n
-14520830-14521117(-)	146	175	29	Forward	n
-14520830-14521117(-)	156	200	44	Forward	n
-14520830-14521117(-)	1	240	239	Forward	n
-14520830-14521117(-)	179	325	146	Forward	n
-14520830-14521117(-)	204	335	131	Forward	n
-14520830-14521117(-)	329	382	53	Forward	n
-14520830-14521117(-)	351	440	89	Forward	n
-14520830-14521117(-)	434	463	29	Forward	n
-14520830-14521117(-)	444	488	44	Forward	n
-14520830-14521117(-)	244	528	284	Forward	n
-14520830-14521117(-)	467	574	107	Forward	n
-14520830-14521117(-)	492	575	83	Forward	n
-14520830-14521117(-)	532	576	44	Forward	n
-14520830-14521117(-)	575	543	32	Forward	n
-14520830-14521117(-)	574	524	50	Forward	n
-14520830-14521117(-)	576	466	110	Forward	n
-14520830-14521117(-)	520	458	62	Forward	n
-14520830-14521117(-)	454	401	53	Forward	n
-14520830-14521117(-)	397	359	38	Forward	n
-14520830-14521117(-)	355	311	44	Forward	n
-14520830-14521117(-)	539	255	284	Forward	n
-14520830-14521117(-)	307	236	71	Forward	n
-14520830-14521117(-)	462	178	284	Forward	y
-14520830-14521117(-)	232	170	62	Forward	n
-14520830-14521117(-)	166	113	53	Forward	n
-14520830-14521117(-)	109	71	38	Forward	n
-14520830-14521117(-)	67	23	44	Forward	n
-14520830-14521117(-)	251	3	248	Forward	n
-14520830-14521117(-)	174	1	173	Forward	n
-103089310-103089560(-)	2	37	35	Reverse	n
-103089310-103089560(-)	24	92	68	Reverse	n
-103089310-103089560(-)	13	117	104	Reverse	n
-103089310-103089560(-)	138	167	29	Reverse	n
-103089310-103089560(-)	180	260	80	Reverse	n
-103089310-103089560(-)	41	271	230	Reverse	n
-103089310-103089560(-)	127	288	161	Reverse	n
-103089310-103089560(-)	275	343	68	Reverse	n
-103089310-103089560(-)	264	368	104	Reverse	n
-103089310-103089560(-)	389	418	29	Reverse	n
-103089310-103089560(-)	378	500	122	Reverse	n
-103089310-103089560(-)	292	501	209	Reverse	n
-103089310-103089560(-)	431	502	71	Reverse	n
-103089310-103089560(-)	500	447	53	Reverse	n
-103089310-103089560(-)	480	436	44	Reverse	n
-103089310-103089560(-)	502	383	119	Reverse	n
-103089310-103089560(-)	426	361	65	Reverse	n
-103089310-103089560(-)	357	289	68	Reverse	n
-103089310-103089560(-)	379	233	146	Reverse	n
-103089310-103089560(-)	279	196	83	Reverse	n
-103089310-103089560(-)	229	185	44	Reverse	n
-103089310-103089560(-)	443	132	311	Reverse	y
-103089310-103089560(-)	175	110	65	Reverse	n
-103089310-103089560(-)	106	38	68	Reverse	n
-103089310-103089560(-)	128	3	125	Reverse	n
-103089310-103089560(-)	192	1	191	Reverse	n
+seqID	transcript	orf_start	orf_end	length	strand	sense	longest
+STRG.4.1(-)_1	STRG.4.1	3	77	74	-	normal	n
+STRG.4.1(-)_2	STRG.4.1	59	88	29	-	normal	n
+STRG.4.1(-)_3	STRG.4.1	92	127	35	-	normal	n
+STRG.4.1(-)_4	STRG.4.1	131	268	137	-	normal	n
+STRG.4.1(-)_5	STRG.4.1	81	332	251	-	normal	n
+STRG.4.1(-)_6	STRG.4.1	272	379	107	-	normal	n
+STRG.4.1(-)_7	STRG.4.1	366	437	71	-	normal	n
+STRG.4.1(-)_8	STRG.4.1	465	518	53	-	normal	n
+STRG.4.1(-)_9	STRG.4.1	383	565	182	-	normal	n
+STRG.4.1(-)_10	STRG.4.1	599	664	65	-	normal	n
+STRG.4.1(-)_11	STRG.4.1	522	665	143	-	normal	n
+STRG.4.1(-)_12	STRG.4.1	1	666	665	-	normal	y
+STRG.4.1(-)_13	STRG.4.1	665	594	71	-	reverse_sense	n
+STRG.4.1(-)_14	STRG.4.1	631	491	140	-	reverse_sense	n
+STRG.4.1(-)_15	STRG.4.1	590	375	215	-	reverse_sense	n
+STRG.4.1(-)_16	STRG.4.1	448	341	107	-	reverse_sense	n
+STRG.4.1(-)_17	STRG.4.1	371	312	59	-	reverse_sense	n
+STRG.4.1(-)_18	STRG.4.1	666	271	395	-	reverse_sense	n
+STRG.4.1(-)_19	STRG.4.1	337	260	77	-	reverse_sense	n
+STRG.4.1(-)_20	STRG.4.1	308	255	53	-	reverse_sense	n
+STRG.4.1(-)_21	STRG.4.1	256	224	32	-	reverse_sense	n
+STRG.4.1(-)_22	STRG.4.1	246	169	77	-	reverse_sense	n
+STRG.4.1(-)_23	STRG.4.1	217	128	89	-	reverse_sense	n
+STRG.4.1(-)_24	STRG.4.1	188	78	110	-	reverse_sense	n
+STRG.4.1(-)_25	STRG.4.1	165	58	107	-	reverse_sense	n
+STRG.4.1(-)_26	STRG.4.1	74	18	56	-	reverse_sense	n
+STRG.4.1(-)_27	STRG.4.1	124	2	122	-	reverse_sense	n
+STRG.4.1(-)_28	STRG.4.1	54	1	53	-	reverse_sense	n
+STRG.6.1(-)_1	STRG.6.1	1	63	62	-	normal	n
+STRG.6.1(-)_2	STRG.6.1	24	182	158	-	normal	n
+STRG.6.1(-)_3	STRG.6.1	94	195	101	-	normal	n
+STRG.6.1(-)_4	STRG.6.1	186	233	47	-	normal	n
+STRG.6.1(-)_5	STRG.6.1	2	250	248	-	normal	y
+STRG.6.1(-)_6	STRG.6.1	199	252	53	-	normal	n
+STRG.6.1(-)_7	STRG.6.1	237	121	116	-	reverse_sense	n
+STRG.6.1(-)_8	STRG.6.1	251	93	158	-	reverse_sense	n
+STRG.6.1(-)_9	STRG.6.1	117	85	32	-	reverse_sense	n
+STRG.6.1(-)_10	STRG.6.1	81	34	47	-	reverse_sense	n
+STRG.6.1(-)_11	STRG.6.1	250	14	236	-	reverse_sense	n
+STRG.6.1(-)_12	STRG.6.1	62	3	59	-	reverse_sense	n
+STRG.6.1(-)_13	STRG.6.1	30	1	29	-	reverse_sense	n
+STRG.8.1(-)_1	STRG.8.1	18	56	38	-	normal	n
+STRG.8.1(-)_2	STRG.8.1	60	95	35	-	normal	n
+STRG.8.1(-)_3	STRG.8.1	99	137	38	-	normal	n
+STRG.8.1(-)_4	STRG.8.1	2	205	203	-	normal	y