view tools/fasta_tools/fasta_filter_by_length.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python
"""
Input: fasta file, minimal length, maximal length, output file name
Output: fasta
Return sequences whose lengths are within the range (bounds inclusive).
A maximal length of 0 means there is no upper limit.
"""

import sys, os

# Refuse to run on interpreters older than Python 2.4.
# NOTE(review): assert statements are skipped under `python -O`, so this
# check is not enforced in optimized mode.
assert sys.version_info[:2] >= ( 2, 4 )

def stop_err( msg ):
    """
    Report a fatal error and terminate the program.

    msg is written to standard error exactly as given (no newline is
    appended), then SystemExit is raised with exit status 1 so that callers
    and wrapping shells can detect the failure from the return code.
    """
    sys.stderr.write( msg )
    # A bare sys.exit() would report status 0 (success) despite the error.
    sys.exit( 1 )

def _length_in_range( length, min_length, max_length ):
    """Return True if min_length <= length <= max_length (max_length 0 = no upper limit)."""
    if length < min_length:
        return False
    return max_length == 0 or length <= max_length

def _filter_records( input_handle, output_handle, min_length, max_length ):
    """
    Copy the FASTA records whose sequence length is in range to output_handle.

    Lines starting with '#' are skipped.  A record is a '>' header line plus
    the lines that follow it; its length is the sum of the lengths of those
    lines with trailing CR/LF removed.  Returns True if at least one record
    was written.
    """
    kept_any = False
    record = []       # lines of the record currently being read
    record_size = 0   # sequence length accumulated for that record
    for line in input_handle:
        if line.startswith( '#' ):
            continue
        if line.startswith( '>' ):
            # A new header ends the previous record.  The "record" test keeps
            # the empty state before the first header from counting as a hit.
            if record and _length_in_range( record_size, min_length, max_length ):
                output_handle.write( ''.join( record ) )
                kept_any = True
            record = [ line ]
            record_size = 0
        elif max_length == 0 or record_size <= max_length:
            # Keep accumulating until the size is strictly greater than
            # max_length.  Stopping as soon as it *equals* max_length would
            # make an over-long sequence look like an exact fit and emit it
            # truncated.  Once the limit is exceeded the record is already
            # rejected, so its remaining lines need not be buffered.
            record_size += len( line.rstrip( '\r\n' ) )
            record.append( line )
    # Final flush: the last record has no following header to trigger it.
    # The trailing line terminator is stripped, as this tool has always done.
    if record and _length_in_range( record_size, min_length, max_length ):
        output_handle.write( ''.join( record ).rstrip( '\r\n' ) )
        kept_any = True
    return kept_any

def __main__():
    """
    Command-line entry point.

    Arguments (read from sys.argv):
        1  input FASTA file name
        2  minimal sequence length (integer, inclusive)
        3  maximal sequence length (integer, inclusive; 0 means no limit)
        4  output FASTA file name

    Writes the records whose length is within the range to the output file
    and prints a notice to standard output when no record qualifies.
    Invalid arguments are reported through stop_err().
    """
    if len( sys.argv ) < 5:
        stop_err( "Usage: fasta_filter_by_length.py <input fasta> <min length> <max length> <output fasta>\n" )
    input_filename = sys.argv[1]
    try:
        min_length = int( sys.argv[2] )
    except ValueError:
        stop_err( "Minimal length of the return sequence requires a numerical value." )
    try:
        max_length = int( sys.argv[3] )
    except ValueError:
        stop_err( "Maximum length of the return sequence requires a numerical value." )
    output_filename = sys.argv[4]

    # Nested try/finally (rather than "with") keeps the script usable on the
    # Python 2.4 baseline this file asserts, and also runs on Python 3.
    input_handle = open( input_filename )
    try:
        output_handle = open( output_filename, 'w' )
        try:
            at_least_one = _filter_records( input_handle, output_handle, min_length, max_length )
        finally:
            output_handle.close()
    finally:
        input_handle.close()

    if not at_least_one:
        sys.stdout.write( "There is no sequence that falls within your range.\n" )

# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    __main__()