Mercurial > repos > mbernt > longorf
changeset 1:1c4b24e9bb16 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 5be33ea99532ab3abb000564af4c63c81c4ccd87
author | mbernt |
---|---|
date | Mon, 16 Jul 2018 11:01:52 -0400 |
parents | ec898924d8c7 |
children | 4952f1ece60b |
files | getLongestORF.py test-data/test_input.fasta test-data/test_output.fasta test-data/test_output.tab |
diffstat | 4 files changed, 248 insertions(+), 293 deletions(-) [+] |
line wrap: on
line diff
--- a/getLongestORF.py Wed Jun 20 11:02:06 2018 -0400 +++ b/getLongestORF.py Mon Jul 16 11:01:52 2018 -0400 @@ -1,114 +1,108 @@ #!/usr/bin/env python -""" -usage: getLongestORF.py input output.fas output.tab - - -input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf") -output.fas: fasta file with all longest ORFs per transcript -output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs - -example: +#example: +#>STRG.1.1(-)_1 [10 - 69] +#GGNHHTLGGKKTFSYTHPPC +#>STRG.1.1(-)_2 [3 - 80] +#FLRGEPPHIGGKKDIFLHPPTLLKGR ->253936-254394(+)_1 [28 - 63] -LTNYCQMVHNIL ->253936-254394(+)_2 [18 - 77] -HKLIDKLLPNGAQYFVKSTQ ->253936-254394(+)_3 [32 - 148] -QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV ->253936-254394(+)_4 [117 - 152] -LGIIVPSLLLCN ->248351-252461(+)_1 [14 - 85] -VLARKYPRCLSPSKKSPCQLRQRS ->248351-252461(+)_2 [21 - 161] -PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP ->248351-252461(+)_3 [89 - 202] -VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG ->248351-252461(+)_4 [206 - 259] -SRTTSGRQSVLSEKLWRR ->248351-252461(+)_5 [263 - 313] -CLSPLWVPCCSRHSCHG -""" +#output1: fasta file with all longest ORFs per transcript +#output2: table with information about seqID, transcript, start, end, strand, length, sense, longest? for all ORFs -import sys,re +import sys,re; def findlongestOrf(transcriptDict,old_seqID): - #write for previous seqID - prevTranscript = transcriptDict[old_seqID] - i_max = 0 - #find longest orf in transcript - for i in range(0,len(prevTranscript)): - if(prevTranscript[i][2] >= prevTranscript[i_max][2]): - i_max = i - for i in range(0,len(prevTranscript)): - prevStart = prevTranscript[i][0] - prevEnd = prevTranscript[i][1] - prevLength = prevTranscript[i][2] - output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength) - if (end - start > 0): - output+="\tForward" - else: - output+="\tReverse" - if(i == i_max): - output += "\ty\n" - else: - output += "\tn\n" - OUTPUT_ORF_SUMMARY.write(output) - transcriptDict.pop(old_seqID, None) - return None + #write for previous seqID + prevTranscript = transcriptDict[old_seqID]; + i_max = 0; + transcript = old_seqID.split("(")[0] + + #find longest orf in transcript + for i in range(0,len(prevTranscript)): + if(prevTranscript[i][2] >= prevTranscript[i_max][2]): + i_max = i; -INPUT = open(sys.argv[1],"r") -OUTPUT_FASTA = open(sys.argv[2],"w") -OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w") + for i in range(0,len(prevTranscript)): + prevORFstart = prevTranscript[i][0]; + prevORFend = prevTranscript[i][1]; + prevORFlength = prevTranscript[i][2]; + header = prevTranscript[i][3]; + strand = re.search('\(([+-]+)\)',header).group(1); + + output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand); + if (prevORFend - prevORFstart > 0): + output+="\tnormal"; + else: + output+="\treverse_sense"; + if(i == i_max): + output += "\ty\n"; + else: + output += "\tn\n"; + + OUTPUT_ORF_SUMMARY.write(output); -seqID = "" -old_seqID = "" -lengthDict = {} -seqDict = {} -headerDict = {} -transcriptDict = {} -skip = False + transcriptDict.pop(old_seqID, None); + return None; + +#----------------------------------------------------------------------------------------------------- + +INPUT = open(sys.argv[1],"r"); +OUTPUT_FASTA = open(sys.argv[2],"w"); +OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w"); -OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n") +seqID = ""; +old_seqID = ""; +lengthDict = {}; +seqDict = {}; +headerDict = {}; +transcriptDict = {}; + +skip = False; + +OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n"); for line in INPUT: - line = line.strip() -# print line - if(re.match(">",line)): #header - seqID = "_".join(line.split(">")[1].split("_")[:-1]) - #seqID = line.split(">")[1].split("_")[0] - start = int (re.search('\ \[(\d+)\ -', line).group(1)) - end = int (re.search('-\ (\d+)\]',line).group(1)) - length = abs(end - start) - if(seqID not in transcriptDict and old_seqID != ""): #new transcript - findlongestOrf(transcriptDict,old_seqID) - if seqID not in transcriptDict: - transcriptDict[seqID] = [] - transcriptDict[seqID].append([start,end,length]) - if(seqID not in lengthDict and old_seqID != ""): #new transcript - #write FASTA - OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n") - #delete old dict entry - headerDict.pop(old_seqID, None) - seqDict.pop(old_seqID, None) - lengthDict.pop(old_seqID, None) - #if several longest sequences exist with the same length, the dictionary saves the last occuring. - if(seqID not in lengthDict or length >= lengthDict[seqID]): - headerDict[seqID] = line - lengthDict[seqID] = length - seqDict[seqID] = "" - skip = False - else: - skip = True - next - old_seqID = seqID - elif(skip): - next - else: - seqDict[seqID] += line + line = line.strip(); + if(re.match(">",line)): #header + header = line.split(">")[1].split(" ")[0] + seqID = "_".join(line.split(">")[1].split("_")[:-1]) + ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1)); + ORFend = int (re.search('-\ (\d+)\]',line).group(1)); + length = abs(ORFend - ORFstart); + + if(seqID not in transcriptDict and old_seqID != ""): #new transcript + findlongestOrf(transcriptDict,old_seqID); + + if seqID not in transcriptDict: + transcriptDict[seqID] = []; + + transcriptDict[seqID].append([ORFstart,ORFend,length,header]); -OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]) -findlongestOrf(transcriptDict,old_seqID) -INPUT.close() -OUTPUT_FASTA.close() -OUTPUT_ORF_SUMMARY.close() + if(seqID not in lengthDict and old_seqID != ""): #new transcript + #write FASTA + OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n"); + #delete old dict entry + headerDict.pop(old_seqID, None); + seqDict.pop(old_seqID, None); + lengthDict.pop(old_seqID, None); + #if several longest sequences exist with the same length, the dictionary saves the last occuring. + if(seqID not in lengthDict or length >= lengthDict[seqID]): + headerDict[seqID] = line; + lengthDict[seqID] = length; + seqDict[seqID] = ""; + skip = False; + else: + skip = True; + next; + old_seqID = seqID; + elif(skip): + next; + else: + seqDict[seqID] += line; + +OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]); +findlongestOrf(transcriptDict,old_seqID); + +INPUT.close(); +OUTPUT_FASTA.close(); +OUTPUT_ORF_SUMMARY.close(); \ No newline at end of file
--- a/test-data/test_input.fasta Wed Jun 20 11:02:06 2018 -0400 +++ b/test-data/test_input.fasta Mon Jul 16 11:01:52 2018 -0400 @@ -1,127 +1,100 @@ ->14520830-14521117(-)_1 [2 - 37] -KPLENISASREF ->14520830-14521117(-)_2 [3 - 47] -SPWRIFQPAENFDLQ ->14520830-14521117(-)_3 [41 - 94] -LAVGFGLIFLRSGWMPCL ->14520830-14521117(-)_4 [63 - 152] -FSYDLGGCLACDSCSSYSPNEGQCPARKLE ->14520830-14521117(-)_5 [146 - 175] -VGMMDLCSET ->14520830-14521117(-)_6 [156 - 200] -WTCVQRLNRTNKQNK ->14520830-14521117(-)_7 [1 - 240] -KAPGEYFSQQRILTCSRIWFDFPTIWVDALPVTVAVPIRQMKGSAPHVSWNDGPVFRDLT -EPTSKTSENRKKEEDTGINS ->14520830-14521117(-)_8 [179 - 325] -QNQQAKQVRTGKRKRTLESILESCTTWFFFHSKFRGTKPLENISASREF ->14520830-14521117(-)_9 [204 - 335] -EQEKGRGHWNQFLRVALLGFSSIPSFVGQSPWRIFQPAENFDLQ ->14520830-14521117(-)_10 [329 - 382] -LAVGFGLIFLRSGWMPCL ->14520830-14521117(-)_11 [351 - 440] -FSYDLGGCLACDSCSSYSPNEGQCPARKLE ->14520830-14521117(-)_12 [434 - 463] -VGMMDLCSET ->14520830-14521117(-)_13 [444 - 488] -WTCVQRLNRTNKQNK ->14520830-14521117(-)_14 [244 - 528] -ELHYLVFLPFQVSWDKAPGEYFSQQRILTCSRIWFDFPTIWVDALPVTVAVPIRQMKGSA -PHVSWNDGPVFRDLTEPTSKTSENRKKEEDTGINS ->14520830-14521117(-)_15 [467 - 574] -QNQQAKQVRTGKRKRTLESILESCTTWFFFHSKFRG ->14520830-14521117(-)_16 [492 - 575] -EQEKGRGHWNQFLRVALLGFSSIPSFVG ->14520830-14521117(-)_17 [532 - 576] -ELHYLVFLPFQVSWD ->14520830-14521117(-)_18 [575 - 543] (REVERSE SENSE) -SHETWNGRKTK ->14520830-14521117(-)_19 [574 - 524] (REVERSE SENSE) -PTKLGMEEKPSSATLKN ->14520830-14521117(-)_20 [576 - 466] (REVERSE SENSE) -VPRNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC ->14520830-14521117(-)_21 [520 - 458] (REVERSE SENSE) -FQCPLPFSCSHLFCLLVLLSL ->14520830-14521117(-)_22 [454 - 401] (REVERSE SENSE) -TQVHHSNLRAGHCPSFGE ->14520830-14521117(-)_23 [397 - 359] (REVERSE SENSE) -ELQLSQARHPPRS ->14520830-14521117(-)_24 [355 - 311] (REVERSE SENSE) -ENQTKSYCKSKFSAG ->14520830-14521117(-)_25 [539 - 255] (REVERSE SENSE) -CNSQELIPVSSSFFLFSLVLLVGSVKSLNTGPSFQLTCGALPFIWRIGTATVTGKASTQI -VGKSNQILLQVKILCWLKYSPGALSHETWNGRKTK ->14520830-14521117(-)_26 [307 - 236] (REVERSE SENSE) -NILQGLCPTKLGMEEKPSSATLKN ->14520830-14521117(-)_27 [462 - 178] (REVERSE SENSE) -VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGFVP -RNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC ->14520830-14521117(-)_28 [232 - 170] (REVERSE SENSE) -FQCPLPFSCSHLFCLLVLLSL ->14520830-14521117(-)_29 [166 - 113] (REVERSE SENSE) -TQVHHSNLRAGHCPSFGE ->14520830-14521117(-)_30 [109 - 71] (REVERSE SENSE) -ELQLSQARHPPRS ->14520830-14521117(-)_31 [67 - 23] (REVERSE SENSE) -ENQTKSYCKSKFSAG ->14520830-14521117(-)_32 [251 - 3] (REVERSE SENSE) -CNSQELIPVSSSFFLFSLVLLVGSVKSLNTGPSFQLTCGALPFIWRIGTATVTGKASTQI -VGKSNQILLQVKILCWLKYSPGA ->14520830-14521117(-)_33 [174 - 1] (REVERSE SENSE) -VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGF ->103089310-103089560(-)_1 [2 - 37] -GTSEKFLKILLS ->103089310-103089560(-)_2 [24 - 92] -RFYYHRYLFWFCVSVLSADGPKL ->103089310-103089560(-)_3 [13 - 117] -KVSEDFIIIDTCFGFVYLYSLQMVQNCNGVCIRRK ->103089310-103089560(-)_4 [138 - 167] -TACIWLHCGL ->103089310-103089560(-)_5 [180 - 260] -NHPYVSVSGYTRKRKESQSGTKSGRYV ->103089310-103089560(-)_6 [41 - 271] -ILVLVLCICTLCRWSKIVMESVLEENKGKIRLNCMYMAPLWLVTLLKSSVCQCIWIHEEK -ERVSEWNKEWEVRLKSF ->103089310-103089560(-)_7 [127 - 288] -NQAELHVYGSTVACDTFKIIRMSVYLDTRGKGKSLRVEQRVGGTSEKFLKILLS ->103089310-103089560(-)_8 [275 - 343] -RFYYHRYLFWFCVSVLSADGPKL ->103089310-103089560(-)_9 [264 - 368] -KVSEDFIIIDTCFGFVYLYSLQMVQNCNGVCIRRK ->103089310-103089560(-)_10 [389 - 418] -TACIWLHCGL ->103089310-103089560(-)_11 [378 - 500] -NQAELHVYGSTVACDTFKIIRMSVYLDTRGKGKSLRVEQRV ->103089310-103089560(-)_12 [292 - 501] -ILVLVLCICTLCRWSKIVMESVLEENKGKIRLNCMYMAPLWLVTLLKSSVCQCIWIHEEK -ERVSEWNKEW ->103089310-103089560(-)_13 [431 - 502] -NHPYVSVSGYTRKRKESQSGTKSG ->103089310-103089560(-)_14 [500 - 447] (REVERSE SENSE) -HSLFHSETLSFSSCIQIH ->103089310-103089560(-)_15 [480 - 436] (REVERSE SENSE) -DSFLFLVYPDTLTYG ->103089310-103089560(-)_16 [502 - 383] (REVERSE SENSE) -PTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA ->103089310-103089560(-)_17 [426 - 361] (REVERSE SENSE) -KCHKPQWSHIHAVQPDFTLIFF ->103089310-103089560(-)_18 [357 - 289] (REVERSE SENSE) -YRLHYNFGPSAESTDTQNQNKYL ->103089310-103089560(-)_19 [379 - 233] (REVERSE SENSE) -FYPYFLLIQTPLQFWTICREYRYTKPKQVSMIIKSSETFQTYLPLFVPL ->103089310-103089560(-)_20 [279 - 196] (REVERSE SENSE) -NLQKLFRRTSHSLFHSETLSFSSCIQIH ->103089310-103089560(-)_21 [229 - 185] (REVERSE SENSE) -DSFLFLVYPDTLTYG ->103089310-103089560(-)_22 [443 - 132] (REVERSE SENSE) -HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNF -SDVPPTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA ->103089310-103089560(-)_23 [175 - 110] (REVERSE SENSE) -KCHKPQWSHIHAVQPDFTLIFF ->103089310-103089560(-)_24 [106 - 38] (REVERSE SENSE) -YRLHYNFGPSAESTDTQNQNKYL ->103089310-103089560(-)_25 [128 - 3] (REVERSE SENSE) -FYPYFLLIQTPLQFWTICREYRYTKPKQVSMIIKSSETFQTY ->103089310-103089560(-)_26 [192 - 1] (REVERSE SENSE) -HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNF -SDVP +>STRG.4.1(-)_1 [3 - 77] +PNHCLRGHESPETRQSPLSGKRIPS +>STRG.4.1(-)_2 [59 - 88] +WKEDPLIVCP +>STRG.4.1(-)_3 [92 - 127] +WNRERYFQGFGL +>STRG.4.1(-)_4 [131 - 268] +LPTQKQKDRWGTHTLERFGFTVPTMPAVISLFTETNPSSQITSTQD +>STRG.4.1(-)_5 [81 - 332] +SAPDGIGKDTSRGSDYNCQPRNKRTAGGLTPWRDLGSQCPQCQRLFHYLRRRIPPVRSLQ +LKTKFWRHPDHLGTHRLLGVPAEN +>STRG.4.1(-)_6 [272 - 379] +VLETPRPSGHAQATWGSCGELSSQMQMESAMMPSTW +>STRG.4.1(-)_7 [366 - 437] +CPLRGRGTGRSHGAAGHAVPPNRH +>STRG.4.1(-)_8 [465 - 518] +KRYGMCCVYLDKFVGGCG +>STRG.4.1(-)_9 [383 - 565] +GHWTKPWSCGACGTTQSTLRPRSSEPIKALRNVLCLPGQICWWLWLSWMGRNKKPWTWFP +W +>STRG.4.1(-)_10 [599 - 664] +SSEWWSWWTSASSPSTPVGRSS +>STRG.4.1(-)_11 [522 - 665] +AGWVGTRSLGPGSLGDQRGPGGALPDRRSGGRGGHRRHPHQLPWGEAA +>STRG.4.1(-)_12 [1 - 666] +DPTTVYVDMRALRHDRVRLVERGSPHSLPLMESGKILPGVRIIIANPETKGPLGDSHLGE +IWVHSAHNASGYFTIYGDESLQSDHFNSRLSFGDTQTIWARTGYLGFLRRTELTDANGER +HDALYVVGALDEAMELRGMRYHPIDIETSVIRAHKSVTECAVFTWTNLLVVVVELDGSEQ +EALDLVPLVTNVVLEEHYLIVGVVVVVDIGVIPINSRGEKQR +>STRG.4.1(-)_13 [665 - 594] (REVERSE SENSE) +RCFSPRELMGMTPMSTTTTTPTIR +>STRG.4.1(-)_14 [631 - 491] (REVERSE SENSE) +RRCPPRPPLRRSGSAPPGPRWSPREPGPRLLVPTHPAQPQPPTNLSR +>STRG.4.1(-)_15 [590 - 375] (REVERSE SENSE) +CSSRTTLVTKGTRSKASCSDPSSSTTTTNKFVQVNTAHSVTLLWALMTEVSMSIGWYRMP +RSSMASSSAPTT +>STRG.4.1(-)_16 [448 - 341] (REVERSE SENSE) +PRSQCRLGGTACPAAPWLRPVPLPRRGHHGALHLHL +>STRG.4.1(-)_17 [371 - 312] (REVERSE SENSE) +RASWRSPFASVSSVLRRNPK +>STRG.4.1(-)_18 [666 - 271] (REVERSE SENSE) +ALLLPTGVDGDDADVHHDHHSDDQVVLLQDHVGHQGNQVQGFLFRPIQLNHNHQQICPGK +HSTFRNAFMGSDDRGLNVDWVVPHAPQLHGFVQCPYHVEGIMALSICICELSSPQEPQVA +CACPDGLGVSKT +>STRG.4.1(-)_19 [337 - 260] (REVERSE SENSE) +AQFSAGTPSSLCVPRWSGCLQNLVLS +>STRG.4.1(-)_20 [308 - 255] (REVERSE SENSE) +PVRAQMVWVSPKLSLELK +>STRG.4.1(-)_21 [256 - 224] (REVERSE SENSE) +SDLTGGIRLRK +>STRG.4.1(-)_22 [246 - 169] (REVERSE SENSE) +LEGFVSVNSEITAGIVGTVNPNLSKV +>STRG.4.1(-)_23 [217 - 128] (REVERSE SENSE) +NNRWHCGHCEPKSLQGVSPPAVLLFLGWQL +>STRG.4.1(-)_24 [188 - 78] (REVERSE SENSE) +TQISPRCESPSGPFVSGLAIIIRTPGSIFPDSIRGRL +>STRG.4.1(-)_25 [165 - 58] (REVERSE SENSE) +VPQRSFCFWVGNYNPNPWKYLSRFHQGQTMRGSSFH +>STRG.4.1(-)_26 [74 - 18] (REVERSE SENSE) +GDPLSTKRTLSCLRALMST +>STRG.4.1(-)_27 [124 - 2] (REVERSE SENSE) +SEPLEVSFPIPSGADYEGILFPLSGLCRVSGLSCPRRQWLG +>STRG.4.1(-)_28 [54 - 1] (REVERSE SENSE) +ADSVVSQGSHVHVDSGWV +>STRG.6.1(-)_1 [1 - 63] +NWDASWRKDVSRSHQCLLPFH +>STRG.6.1(-)_2 [24 - 182] +RCLTQPPVPSAVPLSCSVNFTPLEKWPSAWTLTVDWDLSSGASAVCILGTSPS +>STRG.6.1(-)_3 [94 - 195] +RSGHLPGPLLWTGICPLVPLQCVFWAPVHPDPAL +>STRG.6.1(-)_4 [186 - 233] +SRPLSWKPTPPCGFLP +>STRG.6.1(-)_5 [2 - 250] +TGMLAGVKMSHAATSAFCRSIKLQCELYPSREVAICLDPYCGLGFVLWCLCSVYSGHQSI +LIPPSELETNPALWLLAVSQYKV +>STRG.6.1(-)_6 [199 - 252] +AGNQPRLVASCRESVQSP +>STRG.6.1(-)_7 [237 - 121] (REVERSE SENSE) +LTARSHKAGLVSSSEGGIRMDWCPEYTLQRHQRTNPSPQ +>STRG.6.1(-)_8 [251 - 93] (REVERSE SENSE) +GLCTDSRQEATRRGWFPAQRAGSGWTGAQNTHCRGTRGQIPVHSKGPGRWPLL +>STRG.6.1(-)_9 [117 - 85] (REVERSE SENSE) +GSRQMATSLEG +>STRG.6.1(-)_10 [81 - 34] (REVERSE SENSE) +SSHCSLMERQKALVAA +>STRG.6.1(-)_11 [250 - 14] (REVERSE SENSE) +DFVLTHGKKPQGGVGFQLRGRDQDGLVPRIHTAEAPEDKSQSTVRVQADGHFSRGVKFTL +QLNGTAEGTGGCVRHLYAS +>STRG.6.1(-)_12 [62 - 3] (REVERSE SENSE) +WNGRRHWWLRETSLRQLASQ +>STRG.6.1(-)_13 [30 - 1] (REVERSE SENSE) +DIFTPASIPV +>STRG.8.1(-)_1 [18 - 56] +RTSKKPNGRDPTV +>STRG.8.1(-)_2 [60 - 95] +RLAKAAVVCHRV +>STRG.8.1(-)_3 [99 - 137] +TSLQTAPRLVPTH +>STRG.8.1(-)_4 [2 - 205] +VTPAIKDFQKAQRERSHSLKVGQSCCGLSQSLNISPNRPETGSHTLKMPITTLRILSTRR \ No newline at end of file
--- a/test-data/test_output.fasta Wed Jun 20 11:02:06 2018 -0400 +++ b/test-data/test_output.fasta Mon Jul 16 11:01:52 2018 -0400 @@ -1,4 +1,6 @@ ->14520830-14521117(-)_27 [462 - 178] (REVERSE SENSE) -VSEHRSIIPTYVRGTALHLANRNCNCHRQGIHPDRRKIKPNPTASQNSLLAEIFSRGFVPRNLEWKKNQVVQLSRIDSSVLFLFPVLTCFACWFC ->103089310-103089560(-)_22 [443 - 132] (REVERSE SENSE) -HTDDFKSVTSHSGAIYMQFSLILPLFSSNTDSITILDHLQRVQIHKTKTSIYDNKIFRNFSDVPPTLCSTLRLFPFPRVSRYTDIRMILKVSQATVEPYTCSSA \ No newline at end of file +>STRG.4.1(-)_12 [1 - 666] +DPTTVYVDMRALRHDRVRLVERGSPHSLPLMESGKILPGVRIIIANPETKGPLGDSHLGEIWVHSAHNASGYFTIYGDESLQSDHFNSRLSFGDTQTIWARTGYLGFLRRTELTDANGERHDALYVVGALDEAMELRGMRYHPIDIETSVIRAHKSVTECAVFTWTNLLVVVVELDGSEQEALDLVPLVTNVVLEEHYLIVGVVVVVDIGVIPINSRGEKQR +>STRG.6.1(-)_5 [2 - 250] +TGMLAGVKMSHAATSAFCRSIKLQCELYPSREVAICLDPYCGLGFVLWCLCSVYSGHQSILIPPSELETNPALWLLAVSQYKV +>STRG.8.1(-)_4 [2 - 205] +VTPAIKDFQKAQRERSHSLKVGQSCCGLSQSLNISPNRPETGSHTLKMPITTLRILSTRR \ No newline at end of file
--- a/test-data/test_output.tab Wed Jun 20 11:02:06 2018 -0400 +++ b/test-data/test_output.tab Mon Jul 16 11:01:52 2018 -0400 @@ -1,60 +1,46 @@ -seqID start end length orientation longest -14520830-14521117(-) 2 37 35 Forward n -14520830-14521117(-) 3 47 44 Forward n -14520830-14521117(-) 41 94 53 Forward n -14520830-14521117(-) 63 152 89 Forward n -14520830-14521117(-) 146 175 29 Forward n -14520830-14521117(-) 156 200 44 Forward n -14520830-14521117(-) 1 240 239 Forward n -14520830-14521117(-) 179 325 146 Forward n -14520830-14521117(-) 204 335 131 Forward n -14520830-14521117(-) 329 382 53 Forward n -14520830-14521117(-) 351 440 89 Forward n -14520830-14521117(-) 434 463 29 Forward n -14520830-14521117(-) 444 488 44 Forward n -14520830-14521117(-) 244 528 284 Forward n -14520830-14521117(-) 467 574 107 Forward n -14520830-14521117(-) 492 575 83 Forward n -14520830-14521117(-) 532 576 44 Forward n -14520830-14521117(-) 575 543 32 Forward n -14520830-14521117(-) 574 524 50 Forward n -14520830-14521117(-) 576 466 110 Forward n -14520830-14521117(-) 520 458 62 Forward n -14520830-14521117(-) 454 401 53 Forward n -14520830-14521117(-) 397 359 38 Forward n -14520830-14521117(-) 355 311 44 Forward n -14520830-14521117(-) 539 255 284 Forward n -14520830-14521117(-) 307 236 71 Forward n -14520830-14521117(-) 462 178 284 Forward y -14520830-14521117(-) 232 170 62 Forward n -14520830-14521117(-) 166 113 53 Forward n -14520830-14521117(-) 109 71 38 Forward n -14520830-14521117(-) 67 23 44 Forward n -14520830-14521117(-) 251 3 248 Forward n -14520830-14521117(-) 174 1 173 Forward n -103089310-103089560(-) 2 37 35 Reverse n -103089310-103089560(-) 24 92 68 Reverse n -103089310-103089560(-) 13 117 104 Reverse n -103089310-103089560(-) 138 167 29 Reverse n -103089310-103089560(-) 180 260 80 Reverse n -103089310-103089560(-) 41 271 230 Reverse n -103089310-103089560(-) 127 288 161 Reverse n -103089310-103089560(-) 275 343 68 Reverse n -103089310-103089560(-) 264 368 104 Reverse n -103089310-103089560(-) 389 418 29 Reverse n -103089310-103089560(-) 378 500 122 Reverse n -103089310-103089560(-) 292 501 209 Reverse n -103089310-103089560(-) 431 502 71 Reverse n -103089310-103089560(-) 500 447 53 Reverse n -103089310-103089560(-) 480 436 44 Reverse n -103089310-103089560(-) 502 383 119 Reverse n -103089310-103089560(-) 426 361 65 Reverse n -103089310-103089560(-) 357 289 68 Reverse n -103089310-103089560(-) 379 233 146 Reverse n -103089310-103089560(-) 279 196 83 Reverse n -103089310-103089560(-) 229 185 44 Reverse n -103089310-103089560(-) 443 132 311 Reverse y -103089310-103089560(-) 175 110 65 Reverse n -103089310-103089560(-) 106 38 68 Reverse n -103089310-103089560(-) 128 3 125 Reverse n -103089310-103089560(-) 192 1 191 Reverse n +seqID transcript orf_start orf_end length strand sense longest +STRG.4.1(-)_1 STRG.4.1 3 77 74 - normal n +STRG.4.1(-)_2 STRG.4.1 59 88 29 - normal n +STRG.4.1(-)_3 STRG.4.1 92 127 35 - normal n +STRG.4.1(-)_4 STRG.4.1 131 268 137 - normal n +STRG.4.1(-)_5 STRG.4.1 81 332 251 - normal n +STRG.4.1(-)_6 STRG.4.1 272 379 107 - normal n +STRG.4.1(-)_7 STRG.4.1 366 437 71 - normal n +STRG.4.1(-)_8 STRG.4.1 465 518 53 - normal n +STRG.4.1(-)_9 STRG.4.1 383 565 182 - normal n +STRG.4.1(-)_10 STRG.4.1 599 664 65 - normal n +STRG.4.1(-)_11 STRG.4.1 522 665 143 - normal n +STRG.4.1(-)_12 STRG.4.1 1 666 665 - normal y +STRG.4.1(-)_13 STRG.4.1 665 594 71 - reverse_sense n +STRG.4.1(-)_14 STRG.4.1 631 491 140 - reverse_sense n +STRG.4.1(-)_15 STRG.4.1 590 375 215 - reverse_sense n +STRG.4.1(-)_16 STRG.4.1 448 341 107 - reverse_sense n +STRG.4.1(-)_17 STRG.4.1 371 312 59 - reverse_sense n +STRG.4.1(-)_18 STRG.4.1 666 271 395 - reverse_sense n +STRG.4.1(-)_19 STRG.4.1 337 260 77 - reverse_sense n +STRG.4.1(-)_20 STRG.4.1 308 255 53 - reverse_sense n +STRG.4.1(-)_21 STRG.4.1 256 224 32 - reverse_sense n +STRG.4.1(-)_22 STRG.4.1 246 169 77 - reverse_sense n +STRG.4.1(-)_23 STRG.4.1 217 128 89 - reverse_sense n +STRG.4.1(-)_24 STRG.4.1 188 78 110 - reverse_sense n +STRG.4.1(-)_25 STRG.4.1 165 58 107 - reverse_sense n +STRG.4.1(-)_26 STRG.4.1 74 18 56 - reverse_sense n +STRG.4.1(-)_27 STRG.4.1 124 2 122 - reverse_sense n +STRG.4.1(-)_28 STRG.4.1 54 1 53 - reverse_sense n +STRG.6.1(-)_1 STRG.6.1 1 63 62 - normal n +STRG.6.1(-)_2 STRG.6.1 24 182 158 - normal n +STRG.6.1(-)_3 STRG.6.1 94 195 101 - normal n +STRG.6.1(-)_4 STRG.6.1 186 233 47 - normal n +STRG.6.1(-)_5 STRG.6.1 2 250 248 - normal y +STRG.6.1(-)_6 STRG.6.1 199 252 53 - normal n +STRG.6.1(-)_7 STRG.6.1 237 121 116 - reverse_sense n +STRG.6.1(-)_8 STRG.6.1 251 93 158 - reverse_sense n +STRG.6.1(-)_9 STRG.6.1 117 85 32 - reverse_sense n +STRG.6.1(-)_10 STRG.6.1 81 34 47 - reverse_sense n +STRG.6.1(-)_11 STRG.6.1 250 14 236 - reverse_sense n +STRG.6.1(-)_12 STRG.6.1 62 3 59 - reverse_sense n +STRG.6.1(-)_13 STRG.6.1 30 1 29 - reverse_sense n +STRG.8.1(-)_1 STRG.8.1 18 56 38 - normal n +STRG.8.1(-)_2 STRG.8.1 60 95 35 - normal n +STRG.8.1(-)_3 STRG.8.1 99 137 38 - normal n +STRG.8.1(-)_4 STRG.8.1 2 205 203 - normal y