Mercurial > repos > mbernt > longorf
view getLongestORF.py @ 1:1c4b24e9bb16 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 5be33ea99532ab3abb000564af4c63c81c4ccd87
author | mbernt |
---|---|
date | Mon, 16 Jul 2018 11:01:52 -0400 |
parents | ec898924d8c7 |
children |
line wrap: on
line source
#!/usr/bin/env python #example: #>STRG.1.1(-)_1 [10 - 69] #GGNHHTLGGKKTFSYTHPPC #>STRG.1.1(-)_2 [3 - 80] #FLRGEPPHIGGKKDIFLHPPTLLKGR #output1: fasta file with all longest ORFs per transcript #output2: table with information about seqID, transcript, start, end, strand, length, sense, longest? for all ORFs import sys,re; def findlongestOrf(transcriptDict,old_seqID): #write for previous seqID prevTranscript = transcriptDict[old_seqID]; i_max = 0; transcript = old_seqID.split("(")[0] #find longest orf in transcript for i in range(0,len(prevTranscript)): if(prevTranscript[i][2] >= prevTranscript[i_max][2]): i_max = i; for i in range(0,len(prevTranscript)): prevORFstart = prevTranscript[i][0]; prevORFend = prevTranscript[i][1]; prevORFlength = prevTranscript[i][2]; header = prevTranscript[i][3]; strand = re.search('\(([+-]+)\)',header).group(1); output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand); if (prevORFend - prevORFstart > 0): output+="\tnormal"; else: output+="\treverse_sense"; if(i == i_max): output += "\ty\n"; else: output += "\tn\n"; OUTPUT_ORF_SUMMARY.write(output); transcriptDict.pop(old_seqID, None); return None; #----------------------------------------------------------------------------------------------------- INPUT = open(sys.argv[1],"r"); OUTPUT_FASTA = open(sys.argv[2],"w"); OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w"); seqID = ""; old_seqID = ""; lengthDict = {}; seqDict = {}; headerDict = {}; transcriptDict = {}; skip = False; OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n"); for line in INPUT: line = line.strip(); if(re.match(">",line)): #header header = line.split(">")[1].split(" ")[0] seqID = "_".join(line.split(">")[1].split("_")[:-1]) ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1)); ORFend = int (re.search('-\ (\d+)\]',line).group(1)); length = abs(ORFend - ORFstart); if(seqID not in transcriptDict and old_seqID != ""): #new transcript findlongestOrf(transcriptDict,old_seqID); if seqID not in transcriptDict: transcriptDict[seqID] = []; transcriptDict[seqID].append([ORFstart,ORFend,length,header]); if(seqID not in lengthDict and old_seqID != ""): #new transcript #write FASTA OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n"); #delete old dict entry headerDict.pop(old_seqID, None); seqDict.pop(old_seqID, None); lengthDict.pop(old_seqID, None); #if several longest sequences exist with the same length, the dictionary saves the last occuring. if(seqID not in lengthDict or length >= lengthDict[seqID]): headerDict[seqID] = line; lengthDict[seqID] = length; seqDict[seqID] = ""; skip = False; else: skip = True; next; old_seqID = seqID; elif(skip): next; else: seqDict[seqID] += line; OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]); findlongestOrf(transcriptDict,old_seqID); INPUT.close(); OUTPUT_FASTA.close(); OUTPUT_ORF_SUMMARY.close();