comparison getLongestORF.py @ 1:1c4b24e9bb16 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 5be33ea99532ab3abb000564af4c63c81c4ccd87
author mbernt
date Mon, 16 Jul 2018 11:01:52 -0400
parents ec898924d8c7
children
comparison
equal deleted inserted replaced
0:ec898924d8c7 1:1c4b24e9bb16
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 """ 3 #example:
4 usage: getLongestORF.py input output.fas output.tab 4 #>STRG.1.1(-)_1 [10 - 69]
5 #GGNHHTLGGKKTFSYTHPPC
6 #>STRG.1.1(-)_2 [3 - 80]
7 #FLRGEPPHIGGKKDIFLHPPTLLKGR
5 8
9 #output1: fasta file with all longest ORFs per transcript
10 #output2: table with information about seqID, transcript, start, end, strand, length, sense, longest? for all ORFs
6 11
7 input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf") 12 import sys,re;
8 output.fas: fasta file with all longest ORFs per transcript
9 output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs
10
11 example:
12
13 >253936-254394(+)_1 [28 - 63]
14 LTNYCQMVHNIL
15 >253936-254394(+)_2 [18 - 77]
16 HKLIDKLLPNGAQYFVKSTQ
17 >253936-254394(+)_3 [32 - 148]
18 QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV
19 >253936-254394(+)_4 [117 - 152]
20 LGIIVPSLLLCN
21 >248351-252461(+)_1 [14 - 85]
22 VLARKYPRCLSPSKKSPCQLRQRS
23 >248351-252461(+)_2 [21 - 161]
24 PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP
25 >248351-252461(+)_3 [89 - 202]
26 VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG
27 >248351-252461(+)_4 [206 - 259]
28 SRTTSGRQSVLSEKLWRR
29 >248351-252461(+)_5 [263 - 313]
30 CLSPLWVPCCSRHSCHG
31 """
32
33 import sys,re
34 13
35 def findlongestOrf(transcriptDict,old_seqID): 14 def findlongestOrf(transcriptDict,old_seqID):
36 #write for previous seqID 15 #write for previous seqID
37 prevTranscript = transcriptDict[old_seqID] 16 prevTranscript = transcriptDict[old_seqID];
38 i_max = 0 17 i_max = 0;
39 #find longest orf in transcript 18 transcript = old_seqID.split("(")[0]
40 for i in range(0,len(prevTranscript)):
41 if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
42 i_max = i
43 for i in range(0,len(prevTranscript)):
44 prevStart = prevTranscript[i][0]
45 prevEnd = prevTranscript[i][1]
46 prevLength = prevTranscript[i][2]
47 output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength)
48 if (end - start > 0):
49 output+="\tForward"
50 else:
51 output+="\tReverse"
52 if(i == i_max):
53 output += "\ty\n"
54 else:
55 output += "\tn\n"
56 OUTPUT_ORF_SUMMARY.write(output)
57 transcriptDict.pop(old_seqID, None)
58 return None
59 19
60 INPUT = open(sys.argv[1],"r") 20 #find longest orf in transcript
61 OUTPUT_FASTA = open(sys.argv[2],"w") 21 for i in range(0,len(prevTranscript)):
62 OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w") 22 if(prevTranscript[i][2] >= prevTranscript[i_max][2]):
23 i_max = i;
63 24
64 seqID = "" 25 for i in range(0,len(prevTranscript)):
65 old_seqID = "" 26 prevORFstart = prevTranscript[i][0];
66 lengthDict = {} 27 prevORFend = prevTranscript[i][1];
67 seqDict = {} 28 prevORFlength = prevTranscript[i][2];
68 headerDict = {} 29 header = prevTranscript[i][3];
69 transcriptDict = {} 30 strand = re.search('\(([+-]+)\)',header).group(1);
70 skip = False 31
32 output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand);
33 if (prevORFend - prevORFstart > 0):
34 output+="\tnormal";
35 else:
36 output+="\treverse_sense";
37 if(i == i_max):
38 output += "\ty\n";
39 else:
40 output += "\tn\n";
71 41
72 OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n") 42 OUTPUT_ORF_SUMMARY.write(output);
43
44 transcriptDict.pop(old_seqID, None);
45 return None;
46
47 #-----------------------------------------------------------------------------------------------------
48
49 INPUT = open(sys.argv[1],"r");
50 OUTPUT_FASTA = open(sys.argv[2],"w");
51 OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w");
52
53 seqID = "";
54 old_seqID = "";
55 lengthDict = {};
56 seqDict = {};
57 headerDict = {};
58 transcriptDict = {};
59
60 skip = False;
61
62 OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n");
73 63
74 for line in INPUT: 64 for line in INPUT:
75 line = line.strip() 65 line = line.strip();
76 # print line 66 if(re.match(">",line)): #header
77 if(re.match(">",line)): #header 67 header = line.split(">")[1].split(" ")[0]
78 seqID = "_".join(line.split(">")[1].split("_")[:-1]) 68 seqID = "_".join(line.split(">")[1].split("_")[:-1])
79 #seqID = line.split(">")[1].split("_")[0] 69 ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1));
80 start = int (re.search('\ \[(\d+)\ -', line).group(1)) 70 ORFend = int (re.search('-\ (\d+)\]',line).group(1));
81 end = int (re.search('-\ (\d+)\]',line).group(1)) 71 length = abs(ORFend - ORFstart);
82 length = abs(end - start)
83 if(seqID not in transcriptDict and old_seqID != ""): #new transcript
84 findlongestOrf(transcriptDict,old_seqID)
85 if seqID not in transcriptDict:
86 transcriptDict[seqID] = []
87 transcriptDict[seqID].append([start,end,length])
88 if(seqID not in lengthDict and old_seqID != ""): #new transcript
89 #write FASTA
90 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n")
91 #delete old dict entry
92 headerDict.pop(old_seqID, None)
93 seqDict.pop(old_seqID, None)
94 lengthDict.pop(old_seqID, None)
95 #if several longest sequences exist with the same length, the dictionary saves the last occuring.
96 if(seqID not in lengthDict or length >= lengthDict[seqID]):
97 headerDict[seqID] = line
98 lengthDict[seqID] = length
99 seqDict[seqID] = ""
100 skip = False
101 else:
102 skip = True
103 next
104 old_seqID = seqID
105 elif(skip):
106 next
107 else:
108 seqDict[seqID] += line
109 72
110 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]) 73 if(seqID not in transcriptDict and old_seqID != ""): #new transcript
111 findlongestOrf(transcriptDict,old_seqID) 74 findlongestOrf(transcriptDict,old_seqID);
112 INPUT.close() 75
113 OUTPUT_FASTA.close() 76 if seqID not in transcriptDict:
114 OUTPUT_ORF_SUMMARY.close() 77 transcriptDict[seqID] = [];
78
79 transcriptDict[seqID].append([ORFstart,ORFend,length,header]);
80
81 if(seqID not in lengthDict and old_seqID != ""): #new transcript
82 #write FASTA
83 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n");
84 #delete old dict entry
85 headerDict.pop(old_seqID, None);
86 seqDict.pop(old_seqID, None);
87 lengthDict.pop(old_seqID, None);
88 #if several longest sequences exist with the same length, the dictionary saves the last occuring.
89 if(seqID not in lengthDict or length >= lengthDict[seqID]):
90 headerDict[seqID] = line;
91 lengthDict[seqID] = length;
92 seqDict[seqID] = "";
93 skip = False;
94 else:
95 skip = True;
96 next;
97 old_seqID = seqID;
98 elif(skip):
99 next;
100 else:
101 seqDict[seqID] += line;
102
103 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]);
104 findlongestOrf(transcriptDict,old_seqID);
105
106 INPUT.close();
107 OUTPUT_FASTA.close();
108 OUTPUT_ORF_SUMMARY.close();