Mercurial > repos > mbernt > longorf
comparison getLongestORF.py @ 1:1c4b24e9bb16 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 5be33ea99532ab3abb000564af4c63c81c4ccd87
author | mbernt |
---|---|
date | Mon, 16 Jul 2018 11:01:52 -0400 |
parents | ec898924d8c7 |
children |
comparison
equal
deleted
inserted
replaced
0:ec898924d8c7 | 1:1c4b24e9bb16 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 """ | 3 #example: |
4 usage: getLongestORF.py input output.fas output.tab | 4 #>STRG.1.1(-)_1 [10 - 69] |
5 #GGNHHTLGGKKTFSYTHPPC | |
6 #>STRG.1.1(-)_2 [3 - 80] | |
7 #FLRGEPPHIGGKKDIFLHPPTLLKGR | |
5 | 8 |
9 #output1: fasta file with all longest ORFs per transcript | |
10 #output2: table with information about seqID, transcript, start, end, strand, length, sense, longest? for all ORFs | |
6 | 11 |
7 input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf") | 12 import sys,re; |
8 output.fas: fasta file with all longest ORFs per transcript | |
9 output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs | |
10 | |
11 example: | |
12 | |
13 >253936-254394(+)_1 [28 - 63] | |
14 LTNYCQMVHNIL | |
15 >253936-254394(+)_2 [18 - 77] | |
16 HKLIDKLLPNGAQYFVKSTQ | |
17 >253936-254394(+)_3 [32 - 148] | |
18 QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV | |
19 >253936-254394(+)_4 [117 - 152] | |
20 LGIIVPSLLLCN | |
21 >248351-252461(+)_1 [14 - 85] | |
22 VLARKYPRCLSPSKKSPCQLRQRS | |
23 >248351-252461(+)_2 [21 - 161] | |
24 PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP | |
25 >248351-252461(+)_3 [89 - 202] | |
26 VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG | |
27 >248351-252461(+)_4 [206 - 259] | |
28 SRTTSGRQSVLSEKLWRR | |
29 >248351-252461(+)_5 [263 - 313] | |
30 CLSPLWVPCCSRHSCHG | |
31 """ | |
32 | |
33 import sys,re | |
34 | 13 |
35 def findlongestOrf(transcriptDict,old_seqID): | 14 def findlongestOrf(transcriptDict,old_seqID): |
36 #write for previous seqID | 15 #write for previous seqID |
37 prevTranscript = transcriptDict[old_seqID] | 16 prevTranscript = transcriptDict[old_seqID]; |
38 i_max = 0 | 17 i_max = 0; |
39 #find longest orf in transcript | 18 transcript = old_seqID.split("(")[0] |
40 for i in range(0,len(prevTranscript)): | |
41 if(prevTranscript[i][2] >= prevTranscript[i_max][2]): | |
42 i_max = i | |
43 for i in range(0,len(prevTranscript)): | |
44 prevStart = prevTranscript[i][0] | |
45 prevEnd = prevTranscript[i][1] | |
46 prevLength = prevTranscript[i][2] | |
47 output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength) | |
48 if (end - start > 0): | |
49 output+="\tForward" | |
50 else: | |
51 output+="\tReverse" | |
52 if(i == i_max): | |
53 output += "\ty\n" | |
54 else: | |
55 output += "\tn\n" | |
56 OUTPUT_ORF_SUMMARY.write(output) | |
57 transcriptDict.pop(old_seqID, None) | |
58 return None | |
59 | 19 |
60 INPUT = open(sys.argv[1],"r") | 20 #find longest orf in transcript |
61 OUTPUT_FASTA = open(sys.argv[2],"w") | 21 for i in range(0,len(prevTranscript)): |
62 OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w") | 22 if(prevTranscript[i][2] >= prevTranscript[i_max][2]): |
23 i_max = i; | |
63 | 24 |
64 seqID = "" | 25 for i in range(0,len(prevTranscript)): |
65 old_seqID = "" | 26 prevORFstart = prevTranscript[i][0]; |
66 lengthDict = {} | 27 prevORFend = prevTranscript[i][1]; |
67 seqDict = {} | 28 prevORFlength = prevTranscript[i][2]; |
68 headerDict = {} | 29 header = prevTranscript[i][3]; |
69 transcriptDict = {} | 30 strand = re.search('\(([+-]+)\)',header).group(1); |
70 skip = False | 31 |
32 output = str(header) + "\t" + str(transcript) + "\t" + str(prevORFstart) + "\t" + str(prevORFend) + "\t" + str(prevORFlength) + "\t" + str(strand); | |
33 if (prevORFend - prevORFstart > 0): | |
34 output+="\tnormal"; | |
35 else: | |
36 output+="\treverse_sense"; | |
37 if(i == i_max): | |
38 output += "\ty\n"; | |
39 else: | |
40 output += "\tn\n"; | |
71 | 41 |
72 OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n") | 42 OUTPUT_ORF_SUMMARY.write(output); |
43 | |
44 transcriptDict.pop(old_seqID, None); | |
45 return None; | |
46 | |
47 #----------------------------------------------------------------------------------------------------- | |
48 | |
49 INPUT = open(sys.argv[1],"r"); | |
50 OUTPUT_FASTA = open(sys.argv[2],"w"); | |
51 OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w"); | |
52 | |
53 seqID = ""; | |
54 old_seqID = ""; | |
55 lengthDict = {}; | |
56 seqDict = {}; | |
57 headerDict = {}; | |
58 transcriptDict = {}; | |
59 | |
60 skip = False; | |
61 | |
62 OUTPUT_ORF_SUMMARY.write("seqID\ttranscript\torf_start\torf_end\tlength\tstrand\tsense\tlongest\n"); | |
73 | 63 |
74 for line in INPUT: | 64 for line in INPUT: |
75 line = line.strip() | 65 line = line.strip(); |
76 # print line | 66 if(re.match(">",line)): #header |
77 if(re.match(">",line)): #header | 67 header = line.split(">")[1].split(" ")[0] |
78 seqID = "_".join(line.split(">")[1].split("_")[:-1]) | 68 seqID = "_".join(line.split(">")[1].split("_")[:-1]) |
79 #seqID = line.split(">")[1].split("_")[0] | 69 ORFstart = int (re.search('\ \[(\d+)\ -', line).group(1)); |
80 start = int (re.search('\ \[(\d+)\ -', line).group(1)) | 70 ORFend = int (re.search('-\ (\d+)\]',line).group(1)); |
81 end = int (re.search('-\ (\d+)\]',line).group(1)) | 71 length = abs(ORFend - ORFstart); |
82 length = abs(end - start) | |
83 if(seqID not in transcriptDict and old_seqID != ""): #new transcript | |
84 findlongestOrf(transcriptDict,old_seqID) | |
85 if seqID not in transcriptDict: | |
86 transcriptDict[seqID] = [] | |
87 transcriptDict[seqID].append([start,end,length]) | |
88 if(seqID not in lengthDict and old_seqID != ""): #new transcript | |
89 #write FASTA | |
90 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n") | |
91 #delete old dict entry | |
92 headerDict.pop(old_seqID, None) | |
93 seqDict.pop(old_seqID, None) | |
94 lengthDict.pop(old_seqID, None) | |
95 #if several longest sequences exist with the same length, the dictionary saves the last occuring. | |
96 if(seqID not in lengthDict or length >= lengthDict[seqID]): | |
97 headerDict[seqID] = line | |
98 lengthDict[seqID] = length | |
99 seqDict[seqID] = "" | |
100 skip = False | |
101 else: | |
102 skip = True | |
103 next | |
104 old_seqID = seqID | |
105 elif(skip): | |
106 next | |
107 else: | |
108 seqDict[seqID] += line | |
109 | 72 |
110 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]) | 73 if(seqID not in transcriptDict and old_seqID != ""): #new transcript |
111 findlongestOrf(transcriptDict,old_seqID) | 74 findlongestOrf(transcriptDict,old_seqID); |
112 INPUT.close() | 75 |
113 OUTPUT_FASTA.close() | 76 if seqID not in transcriptDict: |
114 OUTPUT_ORF_SUMMARY.close() | 77 transcriptDict[seqID] = []; |
78 | |
79 transcriptDict[seqID].append([ORFstart,ORFend,length,header]); | |
80 | |
81 if(seqID not in lengthDict and old_seqID != ""): #new transcript | |
82 #write FASTA | |
83 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n"); | |
84 #delete old dict entry | |
85 headerDict.pop(old_seqID, None); | |
86 seqDict.pop(old_seqID, None); | |
87 lengthDict.pop(old_seqID, None); | |
88 #if several longest sequences exist with the same length, the dictionary saves the last occuring. | |
89 if(seqID not in lengthDict or length >= lengthDict[seqID]): | |
90 headerDict[seqID] = line; | |
91 lengthDict[seqID] = length; | |
92 seqDict[seqID] = ""; | |
93 skip = False; | |
94 else: | |
95 skip = True; | |
96 next; | |
97 old_seqID = seqID; | |
98 elif(skip): | |
99 next; | |
100 else: | |
101 seqDict[seqID] += line; | |
102 | |
103 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]); | |
104 findlongestOrf(transcriptDict,old_seqID); | |
105 | |
106 INPUT.close(); | |
107 OUTPUT_FASTA.close(); | |
108 OUTPUT_ORF_SUMMARY.close(); |