view tools/filters/gff/gff_filter_by_feature_count.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python
"""
Filter a gff file using a criterion based on feature counts for a transcript.

Usage:
%prog input_name output_name feature_name condition
"""
import sys
from galaxy import eggs
from galaxy.datatypes.util.gff_util import GFFReaderWrapper
from bx.intervals.io import GenomicInterval

# Valid operators, ordered so that complex operators (e.g. '>=') are
# recognized before simple operators (e.g. '>')
ops = [
    '>=',
    '<=',
    '<',
    '>',
    '==',
    '!='
]

# Escape sequences for valid operators.
mapped_ops = {
    '__ge__': ops[0],
    '__le__': ops[1],
    '__lt__': ops[2],
    '__gt__': ops[3],
    '__eq__': ops[4],
    '__ne__': ops[5],
}


def __main__():
    # Get args.
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    feature_name = sys.argv[3]
    condition = sys.argv[4]
    
    # Unescape operations in condition str.
    for key, value in mapped_ops.items():
        condition = condition.replace( key, value )
    
    # Error checking: condition should be of the form <operator><number>
    for op in ops:
        if op in condition:
            empty, number_str = condition.split( op )
            try:
                number = float( number_str )
            except:
                number = None
            if empty != "" or not number:
                print >> sys.stderr, "Invalid condition: %s, cannot filter." % condition
                return
            break

    # Do filtering.
    kept_features = 0
    skipped_lines = 0
    first_skipped_line = 0
    out = open( output_name, 'w' )
    for i, feature in enumerate( GFFReaderWrapper( open( input_name ) ) ):
        if not isinstance( feature, GenomicInterval ):
            continue
        count = 0
        for interval in feature.intervals:
            if interval.feature == feature_name:
                count += 1
        if eval( '%s %s' % ( count, condition ) ):
            # Keep feature.
            for interval in feature.intervals:
                out.write( "\t".join(interval.fields) + '\n' )
            kept_features += 1

    # Needed because i is 0-based but want to display stats using 1-based.
    i += 1

    # Clean up.
    out.close()
    info_msg = "%i of %i features kept (%.2f%%) using condition %s.  " % \
        ( kept_features, i, float(kept_features)/i * 100.0, feature_name + condition )
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
    print info_msg

if __name__ == "__main__": __main__()