view interproscan5/create_html_index.py @ 0:0da2847fc108 draft default tip

Uploaded
author si-datascience
date Thu, 24 May 2018 14:57:30 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

import os
import re
import sys

# HTML preamble written at the start of every generated index page: opens the
# document, defines the link colours and emits the page heading.
START = '''<html>
<head>
<style>
    a:link { text-decoration: none; color: red; }
    a:visited { text-decoration: none; color: blue; }
    a:hover { text-decoration: underline; color: green; } a:active { text-decoration: underline; color: green; }
</style>
</head>
<body>
<h1>InterProScan result summary page</h1>
'''

# Closing markup written at the end of every generated index page.
END = '''
</body>
</html>
'''


def raw_mode(html_file, directory):
    """Write a flat HTML index that links to every regular file in a directory.

    Args:
        html_file: path of the HTML page to create (overwritten if present).
        directory: directory whose regular files are listed, sorted by name.
            Each link targets the bare file name and is labelled with the
            file name minus its extension.
    """
    with open(html_file, 'w') as page:
        page.write(START)
        page.write('<ul>')
        # The directory is listed only after the output file has been opened,
        # so an index created inside ``directory`` lists itself as well.
        for entry in sorted(os.listdir(directory)):
            if not os.path.isfile(os.path.join(directory, entry)):
                continue
            label = os.path.splitext(entry)[0]
            page.write('<li><a href="%s"> %s </a></li>' % (entry, label))
        page.write('</ul>')
        page.write(END)


def fix_name(name):
    """Return ``name`` with every ``&`` and ``/`` replaced by an underscore."""
    return name.replace('&', '_').replace('/', '_')


def _feature_link(name, count, files):
    """Return the anchor HTML for one sequence and claim its result file.

    ``files`` maps file stems to file names.  The entry matching ``name`` is
    removed once it has been linked, so whatever is left in ``files`` after
    all sequences were processed is known not to be referenced by the TSV.
    """
    stem = fix_name(name)
    if stem in files:
        return '<a href="%s"> %s (%d features)</a>' % (files.pop(stem), name, count)
    # No matching file: keep the entry visible but flag the dead link.
    return '<a href="%s"> %s (%d features, broken link!)</a>' % (name, name, count)


def cooked_mode(orfed_ids, tsv_file, html_file, directory):
    """Write an HTML index of per-sequence files, ordered by feature count.

    The first tab-separated column of every TSV row is taken as a sequence
    name; the number of rows per name is reported as its feature count.
    Names are grouped, groups are listed by descending total count, and each
    name is linked to the file in ``directory`` whose stem equals
    ``fix_name(name)``.  Files that no TSV row refers to are listed at the
    end under "Sequences without any features".

    Args:
        orfed_ids: when true, a trailing ``_<digits>`` suffix is stripped
            from each name to obtain its group key; otherwise every name is
            its own group.
        tsv_file: path of the tab-separated input file.
        html_file: path of the HTML page to create (overwritten if present).
        directory: directory holding the files to link to.
    """
    # group key -> [total row count, {sequence name: row count}]
    name_freq = {}
    with open(tsv_file) as f:
        for line in f:
            # Drop the line terminator so that a row without any tab does not
            # yield a name ending in a newline, and ignore blank lines rather
            # than counting them as a sequence.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            name = line.split('\t', 1)[0]
            deorfed_name = re.sub(r'_\d+$', '', name) if orfed_ids else name

            data = name_freq.setdefault(deorfed_name, [0, {}])
            data[0] += 1
            data[1][name] = data[1].get(name, 0) + 1

    # Most frequent groups first; sorted() is stable, so ties keep dict order.
    groups = sorted(name_freq.items(), key=lambda item: item[1][0], reverse=True)

    # file stem -> file name, for every regular file in the directory
    filename_dict = {}
    for filename in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, filename)):
            filename_dict[os.path.splitext(filename)[0]] = filename

    with open(html_file, 'w') as h:
        h.write(START)
        h.write('<ol>')
        for don, (freq, per_name) in groups:
            members = sorted(per_name.items(), key=lambda item: item[1], reverse=True)
            h.write('\n<li>')
            if len(members) == 1:
                only_name, only_count = members[0]
                h.write(_feature_link(only_name, only_count, filename_dict))
            else:
                # Several names share this group: show the group total and
                # nest one link per member below it.
                h.write('%s (%d features)' % (don, freq))
                h.write('\n<ul>')
                for n, count in members:
                    h.write('\n<li>%s</li>' % _feature_link(n, count, filename_dict))
                h.write('</ul>')
            h.write('</li>')
        # Close the ordered list before the trailing section: a heading and a
        # second list are not valid children of <ol>.
        h.write('</ol>')

        if filename_dict:
            h.write('<h2>Sequences without any features</h2>')
            h.write('\n<ul>')
            for n, p in sorted(filename_dict.items(), key=lambda t: t[0]):
                h.write('\n<li><a href="%s">%s</a></li>' % (p, n))
            h.write('</ul>')
        h.write(END)


def main():
    """Dispatch on the command-line arguments.

    Two arguments (``html_file directory``) select ``raw_mode``; four
    arguments (``[0|1] tsv_file html_file directory``) select
    ``cooked_mode``, where any first argument other than ``'0'`` enables
    suffix stripping.  Any other argument count writes a usage message to
    stderr and exits with status 1.
    """
    if len(sys.argv) == 3:
        raw_mode(sys.argv[1], sys.argv[2])
    elif len(sys.argv) == 5:
        cooked_mode(sys.argv[1] != '0', sys.argv[2], sys.argv[3], sys.argv[4])
    else:
        # Usage errors belong on stderr so they never mix with regular output.
        sys.stderr.write('Args must be "html_file directory" or "[0|1] tsv_file html_file directory"\n')
        sys.exit(1)


# Only run when executed as a script, so importing the module has no side effects.
if __name__ == '__main__':
    main()