Mercurial > repos > mbernt > longorf
comparison getLongestORF.py @ 0:ec898924d8c7 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/blob/master/tools/longorf/ commit 8e118a4d24047e2c62912b962e854f789d6ff559
author | mbernt |
---|---|
date | Wed, 20 Jun 2018 11:02:06 -0400 |
parents | |
children | 1c4b24e9bb16 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ec898924d8c7 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 usage: getLongestORF.py input output.fas output.tab | |
5 | |
6 | |
7 input.fas: a amino acid fasta file of all open reading frames (ORF) listed by transcript (output of GalaxyTool "getorf") | |
8 output.fas: fasta file with all longest ORFs per transcript | |
9 output.tab: table with information about seqID, start, end, length, orientation, longest for all ORFs | |
10 | |
11 example: | |
12 | |
13 >253936-254394(+)_1 [28 - 63] | |
14 LTNYCQMVHNIL | |
15 >253936-254394(+)_2 [18 - 77] | |
16 HKLIDKLLPNGAQYFVKSTQ | |
17 >253936-254394(+)_3 [32 - 148] | |
18 QTTAKWCTIFCKKYPVAPFHTMYLNYAVTWHHRSLLVAV | |
19 >253936-254394(+)_4 [117 - 152] | |
20 LGIIVPSLLLCN | |
21 >248351-252461(+)_1 [14 - 85] | |
22 VLARKYPRCLSPSKKSPCQLRQRS | |
23 >248351-252461(+)_2 [21 - 161] | |
24 PGNTHDASAHRKSLRVNSDKEVKCLFTKNAASEHPDHKRRRVSEHVP | |
25 >248351-252461(+)_3 [89 - 202] | |
26 VPLHQECCIGAPRPQTTACVRACAMTNTPRSSMTSKTG | |
27 >248351-252461(+)_4 [206 - 259] | |
28 SRTTSGRQSVLSEKLWRR | |
29 >248351-252461(+)_5 [263 - 313] | |
30 CLSPLWVPCCSRHSCHG | |
31 """ | |
32 | |
33 import sys,re | |
34 | |
35 def findlongestOrf(transcriptDict,old_seqID): | |
36 #write for previous seqID | |
37 prevTranscript = transcriptDict[old_seqID] | |
38 i_max = 0 | |
39 #find longest orf in transcript | |
40 for i in range(0,len(prevTranscript)): | |
41 if(prevTranscript[i][2] >= prevTranscript[i_max][2]): | |
42 i_max = i | |
43 for i in range(0,len(prevTranscript)): | |
44 prevStart = prevTranscript[i][0] | |
45 prevEnd = prevTranscript[i][1] | |
46 prevLength = prevTranscript[i][2] | |
47 output = str(old_seqID) + "\t" + str(prevStart) + "\t" + str(prevEnd) + "\t" + str(prevLength) | |
48 if (end - start > 0): | |
49 output+="\tForward" | |
50 else: | |
51 output+="\tReverse" | |
52 if(i == i_max): | |
53 output += "\ty\n" | |
54 else: | |
55 output += "\tn\n" | |
56 OUTPUT_ORF_SUMMARY.write(output) | |
57 transcriptDict.pop(old_seqID, None) | |
58 return None | |
59 | |
60 INPUT = open(sys.argv[1],"r") | |
61 OUTPUT_FASTA = open(sys.argv[2],"w") | |
62 OUTPUT_ORF_SUMMARY = open(sys.argv[3],"w") | |
63 | |
64 seqID = "" | |
65 old_seqID = "" | |
66 lengthDict = {} | |
67 seqDict = {} | |
68 headerDict = {} | |
69 transcriptDict = {} | |
70 skip = False | |
71 | |
72 OUTPUT_ORF_SUMMARY.write("seqID\tstart\tend\tlength\torientation\tlongest\n") | |
73 | |
74 for line in INPUT: | |
75 line = line.strip() | |
76 # print line | |
77 if(re.match(">",line)): #header | |
78 seqID = "_".join(line.split(">")[1].split("_")[:-1]) | |
79 #seqID = line.split(">")[1].split("_")[0] | |
80 start = int (re.search('\ \[(\d+)\ -', line).group(1)) | |
81 end = int (re.search('-\ (\d+)\]',line).group(1)) | |
82 length = abs(end - start) | |
83 if(seqID not in transcriptDict and old_seqID != ""): #new transcript | |
84 findlongestOrf(transcriptDict,old_seqID) | |
85 if seqID not in transcriptDict: | |
86 transcriptDict[seqID] = [] | |
87 transcriptDict[seqID].append([start,end,length]) | |
88 if(seqID not in lengthDict and old_seqID != ""): #new transcript | |
89 #write FASTA | |
90 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]+"\n") | |
91 #delete old dict entry | |
92 headerDict.pop(old_seqID, None) | |
93 seqDict.pop(old_seqID, None) | |
94 lengthDict.pop(old_seqID, None) | |
95 #if several longest sequences exist with the same length, the dictionary saves the last occuring. | |
96 if(seqID not in lengthDict or length >= lengthDict[seqID]): | |
97 headerDict[seqID] = line | |
98 lengthDict[seqID] = length | |
99 seqDict[seqID] = "" | |
100 skip = False | |
101 else: | |
102 skip = True | |
103 next | |
104 old_seqID = seqID | |
105 elif(skip): | |
106 next | |
107 else: | |
108 seqDict[seqID] += line | |
109 | |
110 OUTPUT_FASTA.write(headerDict[old_seqID]+"\n"+seqDict[old_seqID]) | |
111 findlongestOrf(transcriptDict,old_seqID) | |
112 INPUT.close() | |
113 OUTPUT_FASTA.close() | |
114 OUTPUT_ORF_SUMMARY.close() |