annotate interproscan5/create_html_index.py @ 0:0da2847fc108 draft default tip

Uploaded
author si-datascience
date Thu, 24 May 2018 14:57:30 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
1 #!/usr/bin/env python
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
2
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
3 import os
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
4 import re
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
5 import sys
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
6
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
# Opening boilerplate written at the top of every generated index page:
# the document head with the link colour styles, the opening <body> tag and
# the page heading.
START = '''<html>
<head>
<style>
a:link { text-decoration: none; color: red; }
a:visited { text-decoration: none; color: blue; }
a:hover { text-decoration: underline; color: green; } a:active { text-decoration: underline; color: green; }
</style>
</head>
<body>
<h1>InterProScan result summary page</h1>
'''

# Closing boilerplate written at the end of every generated index page; it
# closes the <body> and <html> elements opened by START.
END = '''
</body>
</html>
'''
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
23
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
24
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
def raw_mode(html_file, directory):
    """Write an HTML index with one link per regular file in ``directory``.

    Args:
        html_file: Path of the HTML page to create (truncated if it exists).
        directory: Directory to list. Only regular files are included, in
            sorted filename order; subdirectories are skipped.

    Each entry's ``href`` is the bare filename and its label is the filename
    without its extension. Both are HTML-escaped so that names containing
    ``&``, ``<``, ``>`` or quotes cannot break the generated markup.
    """
    # xml.sax.saxutils is used (rather than html.escape) because it is
    # available on both Python 2 and 3 and the shebang does not pin a version.
    from xml.sax.saxutils import escape, quoteattr

    with open(html_file, 'w') as h:
        h.write(START)
        h.write('<ul>')
        # The directory is listed after html_file has been opened, as before,
        # so the set of files that ends up in the index is unchanged.
        for filename in sorted(os.listdir(directory)):
            if not os.path.isfile(os.path.join(directory, filename)):
                continue
            label = os.path.splitext(filename)[0]
            # quoteattr() returns the value already wrapped in quotes.
            h.write('<li><a href=%s> %s </a></li>' % (quoteattr(filename), escape(label)))
        h.write('</ul>')
        h.write(END)
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
33
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
34
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
def fix_name(name):
    """Return ``name`` with every ``&`` and ``/`` replaced by an underscore."""
    unsafe_chars = re.compile('[&/]')
    return unsafe_chars.sub('_', name)
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
37
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
38
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
def cooked_mode(orfed_ids, tsv_file, html_file, directory):
    """Write an HTML index of result files, grouped and ranked by feature count.

    Args:
        orfed_ids: When true, a trailing ``_<digits>`` suffix is stripped from
            each sequence name to obtain its group name, so names differing
            only in that suffix are listed together (presumably the suffix is
            an ORF index added upstream -- confirm against the caller).
        tsv_file: Tab-separated input. Only the text before the first tab of
            each line (the sequence name) is used; every non-blank line counts
            as one feature for that name.
        html_file: Path of the HTML page to create (truncated if it exists).
        directory: Directory holding the result files. A file belongs to a
            sequence when its name without extension equals
            ``fix_name(sequence_name)``.

    Groups are ordered by total feature count, highest first, and the
    sequences inside a group by their own count, highest first. A sequence
    with no matching file is still listed but flagged as a broken link. Files
    in ``directory`` that no sequence refers to are listed afterwards under
    "Sequences without any features".
    """
    # xml.sax.saxutils is used (rather than html.escape) because it is
    # available on both Python 2 and 3 and the shebang does not pin a version.
    from xml.sax.saxutils import escape, quoteattr

    # group name -> [total feature count, {sequence name: feature count}]
    name_freq = {}
    with open(tsv_file) as tsv:
        for line in tsv:
            # Drop the line terminator so a name on a tab-less line does not
            # carry a newline, and skip blank lines instead of counting them.
            line = line.rstrip('\r\n')
            if not line:
                continue
            name = line.split("\t", 1)[0]
            group = re.sub('_\\d+$', '', name) if orfed_ids else name

            data = name_freq.setdefault(group, [0, {}])
            data[0] += 1
            data[1][name] = data[1].get(name, 0) + 1

    name_freq_sorted = [
        (group, (total, sorted(members.items(), key=lambda t: t[1], reverse=True)))
        for group, (total, members) in sorted(
            name_freq.items(), key=lambda t: t[1][0], reverse=True)
    ]

    # filename without extension -> filename, for every regular file.
    filename_dict = {}
    for filename in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, filename)):
            filename_dict[os.path.splitext(filename)[0]] = filename

    def link(name, count):
        """Return the anchor element for one sequence and its feature count."""
        # pop() claims the file, so whatever is left in filename_dict at the
        # end is exactly the set of files no TSV row referred to.
        target = filename_dict.pop(fix_name(name), None)
        if target is not None:
            return '<a href=%s> %s (%d features)</a>' % (
                quoteattr(target), escape(name), count)
        return '<a href=%s> %s (%d features, broken link!)</a>' % (
            quoteattr(name), escape(name), count)

    with open(html_file, 'w') as h:
        h.write(START)
        h.write('<ol>')
        for group, (total, members) in name_freq_sorted:
            h.write('\n<li>')
            if len(members) == 1:
                # A lone sequence is shown directly, without a nested list.
                h.write(link(*members[0]))
            else:
                h.write('%s (%d features)' % (escape(group), total))
                h.write('\n<ul>')
                for name, count in members:
                    h.write('\n<li>%s</li>' % link(name, count))
                h.write('</ul>')
            h.write('</li>')
        # Close the ranked list before the next section: a heading and a
        # second list are not valid children of <ol>.
        h.write('</ol>')

        if filename_dict:
            h.write('<h2>Sequences without any features</h2>')
            h.write('\n<ul>')
            for stem, path in sorted(filename_dict.items()):
                h.write('\n<li><a href=%s>%s</a></li>' % (quoteattr(path), escape(stem)))
            h.write('</ul>')
        h.write(END)
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
100
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
101
0da2847fc108 Uploaded
si-datascience
parents:
diff changeset
def main():
    """Dispatch on the command-line arguments.

    Two invocations are accepted:

    * ``html_file directory`` -- plain listing via ``raw_mode``.
    * ``[0|1] tsv_file html_file directory`` -- grouped listing via
      ``cooked_mode``; any first argument other than ``0`` enables stripping
      of the numeric suffix from sequence names.

    Any other argument count prints a usage message to stderr and exits with
    status 1.
    """
    argc = len(sys.argv)
    if argc == 3:
        raw_mode(sys.argv[1], sys.argv[2])
    elif argc == 5:
        cooked_mode(sys.argv[1] != '0', sys.argv[2], sys.argv[3], sys.argv[4])
    else:
        # Usage errors belong on stderr so they never mix with regular output.
        sys.stderr.write('Args must be "html_file directory" or "[0|1] tsv_file html_file directory"\n')
        sys.exit(1)


# Guarded so that importing this module does not parse sys.argv or exit.
if __name__ == '__main__':
    main()