Mercurial > repos > si-datascience > interps_test
diff interproscan5/create_html_index.py @ 0:0da2847fc108 draft default tip
Uploaded
author | si-datascience |
---|---|
date | Thu, 24 May 2018 14:57:30 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
"""Write an HTML summary page that links to the files found in a directory.

Two command-line forms are accepted (see ``main``):

    create_html_index.py html_file directory
    create_html_index.py [0|1] tsv_file html_file directory

The first form lists every file in ``directory``.  The second form groups
the names found in the first column of ``tsv_file``, counts how often each
occurs, and links every name to the file in ``directory`` whose stem matches.
"""

import os
import re
import sys

try:
    from html import escape as _escape
except ImportError:  # Python 2 has no ``html`` module
    from cgi import escape as _escape

START = '''<html>
<head>
<style>
    a:link { text-decoration: none; color: red; }
    a:visited { text-decoration: none; color: blue; }
    a:hover { text-decoration: underline; color: green; } a:active { text-decoration: underline; color: green; }
</style>
</head>
<body>
<h1>InterProScan result summary page</h1>
'''

END = '''
</body>
</html>
'''

# Trailing "_<digits>" suffix that is removed from a name when ids are "orfed".
_ORF_SUFFIX = re.compile(r'_\d+$')


def _esc(text):
    """Escape *text* for use in HTML element content or a quoted attribute."""
    return _escape(text, True)


def _list_files(directory):
    """Return the names of the regular files directly inside *directory*."""
    return [entry for entry in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, entry))]


def raw_mode(html_file, directory):
    """Write *html_file* as a flat list of links to the files in *directory*.

    Files are listed in sorted order; each link targets the bare filename
    and is labelled with the filename minus its extension.
    """
    # List before creating the output so a new index written into
    # *directory* does not end up linking to itself.
    filenames = sorted(_list_files(directory))
    with open(html_file, 'w') as h:
        h.write(START)
        h.write('<ul>')
        for filename in filenames:
            label = os.path.splitext(filename)[0]
            h.write('<li><a href="%s"> %s </a></li>' % (_esc(filename), _esc(label)))
        h.write('</ul>')
        h.write(END)


def fix_name(name):
    """Return *name* with every ``&`` and ``/`` replaced by ``_``."""
    return re.sub('[&/]', '_', name)


def _count_names(orfed_ids, tsv_file):
    """Count first-column names of *tsv_file*, grouped by their base name.

    Returns a dict mapping group name -> ``[total, {name: count}]``.  The
    group name is the name itself, or the name without its trailing
    ``_<digits>`` suffix when *orfed_ids* is true.
    """
    name_freq = {}
    with open(tsv_file) as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line:
                # A blank line carries no name; counting it would produce a
                # bogus "broken link" entry in the page.
                continue
            name = line.split('\t', 1)[0]
            group = _ORF_SUFFIX.sub('', name) if orfed_ids else name
            entry = name_freq.setdefault(group, [0, {}])
            entry[0] += 1
            entry[1][name] = entry[1].get(name, 0) + 1
    return name_freq


def _feature_link(name, count, filename_dict):
    """Return the ``<a>`` element for *name* and consume its file entry.

    The target is looked up in *filename_dict* under ``fix_name(name)`` and
    removed from it, so whatever remains afterwards had no features.  When no
    file matches, the link points at the raw name and is marked as broken.
    """
    target = filename_dict.pop(fix_name(name), None)
    if target is None:
        return ('<a href="%s"> %s (%d features, broken link!)</a>'
                % (_esc(name), _esc(name), count))
    return '<a href="%s"> %s (%d features)</a>' % (_esc(target), _esc(name), count)


def cooked_mode(orfed_ids, tsv_file, html_file, directory):
    """Write *html_file* as a frequency-ranked index of the names in *tsv_file*.

    Parameters:
        orfed_ids: when true, a trailing ``_<digits>`` suffix is stripped
            from each name to form the group it is listed under.
        tsv_file: tab-separated input; only the first column is read.
        html_file: path of the HTML page to write.
        directory: directory whose files are the link targets; a name is
            matched to the file whose stem equals ``fix_name(name)``.

    Groups are ordered by total count (descending), and the names inside a
    group by their own count (descending).  Files in *directory* that were
    not linked from any name are listed at the end under
    "Sequences without any features".
    """
    name_freq = _count_names(orfed_ids, tsv_file)
    ranked = [
        (group, total, sorted(members.items(), key=lambda kv: kv[1], reverse=True))
        for group, (total, members)
        in sorted(name_freq.items(), key=lambda kv: kv[1][0], reverse=True)
    ]

    # stem -> filename; entries are removed as they get linked.
    filename_dict = {}
    for filename in _list_files(directory):
        filename_dict[os.path.splitext(filename)[0]] = filename

    with open(html_file, 'w') as h:
        h.write(START)
        h.write('<ol>')
        for group, total, members in ranked:
            h.write('\n<li>')
            if len(members) == 1:
                name, count = members[0]
                h.write(_feature_link(name, count, filename_dict))
            else:
                h.write('%s (%d features)' % (_esc(group), total))
                h.write('\n<ul>')
                for name, count in members:
                    h.write('\n<li>%s</li>' % _feature_link(name, count, filename_dict))
                h.write('</ul>')
            h.write('</li>')
        # Close the ranked list first: a heading and a second list are not
        # valid children of <ol>.
        h.write('</ol>')

        if filename_dict:
            h.write('<h2>Sequences without any features</h2>')
            h.write('\n<ul>')
            for stem, filename in sorted(filename_dict.items()):
                h.write('\n<li><a href="%s">%s</a></li>' % (_esc(filename), _esc(stem)))
            h.write('</ul>')
        h.write(END)


def main():
    """Dispatch on the number of command-line arguments.

    Two arguments select ``raw_mode``; four select ``cooked_mode``, where a
    first argument other than ``'0'`` enables suffix stripping.  Any other
    count prints a usage message and exits with status 1.
    """
    if len(sys.argv) == 3:
        raw_mode(sys.argv[1], sys.argv[2])
    elif len(sys.argv) == 5:
        cooked_mode(sys.argv[1] != '0', sys.argv[2], sys.argv[3], sys.argv[4])
    else:
        print('Args must be "html_file directory" or "[0|1] tsv_file html_file directory"')
        sys.exit(1)


if __name__ == '__main__':
    main()