Mercurial > repos > devteam > cluster

#!/usr/bin/env python
"""
Cluster regions of intervals.

usage: %prog in_file out_file
    -1, --cols1=N,N,N,N: Columns for start, end, strand in file
    -d, --distance=N: Maximum distance between clustered intervals
    -v, --overlap=N: Minimum overlap require (negative distance)
    -m, --minregions=N: Minimum regions per cluster
    -o, --output=N: 1)merged 2)filtered 3)clustered 4) minimum 5) maximum
"""
from __future__ import print_function

import fileinput
import sys

from bx.cookbook import doc_optparse
from bx.intervals.io import GenomicInterval, NiceReaderWrapper
from bx.intervals.operations.find_clusters import find_clusters
from bx.tabular.io import ParseError
from galaxy.tools.util.galaxyops import fail, parse_cols_arg, skipped

assert sys.version_info[:2] >= ( 2, 4 )


def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            distance = -1 * int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )

    f1 = open( in_fname, "r" )
    out_file = open( out_fname, "w" )

    # If "merge"
    if output == 1:
        fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write( "%s\n" % "\t".join( fields ) )

    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write( "%s\n" % line.rstrip( "\n\r" ) )

    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) )

    # If "minimum" we output the smallest interval in each cluster
    if output == 4 or output == 5:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval( g1, fileline.split("\t"),
                                                            g1.chrom_col,
                                                            g1.start_col,
                                                            g1.end_col,
                                                            g1.strand_col,
                                                            g1.default_strand,
                                                            g1.fix_strand )
                    except Exception as exc:
                        print(str( exc ), file=sys.stderr)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                       ( outsize > interval_size and output == 4 ) or \
                       ( outsize < interval_size and output == 5 ):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write( "%s\n" % outinterval )

    f1.close()
    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))


if __name__ == "__main__":
    main()
author	devteam
date	Thu, 22 Jun 2017 18:40:19 -0400
parents	765ceb06c3e2
children