view tools/filters/trimmer.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python

import sys
import optparse

def stop_err( msg ):
    sys.stderr.write( msg )
    sys.exit()

def main():
    usage = """%prog [options]
    
options (listed below) default to 'None' if omitted
    """
    parser = optparse.OptionParser(usage=usage)
    
    parser.add_option(
        '-a','--ascii',
        dest='ascii',
        action='store_true',
        default = False,
        help='Use ascii codes to defined ignored beginnings instead of raw characters')
        
    parser.add_option(
        '-q','--fastq',
        dest='fastq',
        action='store_true',
        default = False,
        help='The input data in fastq format. It selected the script skips every even line since they contain sequence ids')

    parser.add_option(
        '-i','--ignore',
        dest='ignore',
        help='A comma separated list on ignored beginnings (e.g., ">,@"), or its ascii codes (e.g., "60,42") if option -a is enabled')

    parser.add_option(
        '-s','--start',
        dest='start',
        default = '0',
        help='Trim from beginning to here (1-based)')

    parser.add_option(
        '-e','--end',
        dest='end',
        default = '0',
        help='Trim from here to the ned (1-based)')

    parser.add_option(
        '-f','--file',
        dest='input_txt',
        default = False,
        help='Name of file to be chopped. STDIN is default')
            
    parser.add_option(
        '-c','--column',
        dest='col',
        default = '0',
        help='Column to chop. If 0 = chop the whole line')
       

    options, args = parser.parse_args()
    invalid_starts = []

    if options.input_txt:
		infile = open ( options.input_txt, 'r')
    else:
    	infile = sys.stdin
    	
    if options.ignore and options.ignore != "None":
        invalid_starts = options.ignore.split(',')
        
    if options.ascii and options.ignore and options.ignore != "None":
        for i, item in enumerate( invalid_starts ):
            invalid_starts[i] = chr( int( item ) )

    col = int( options.col )
 
    for i, line in enumerate( infile ):
        line = line.rstrip( '\r\n' )
        if line:
            
            if options.fastq and i % 2 == 0:
                print line
                continue
                

            if line[0] not in invalid_starts:
                if col == 0:
                    if int( options.end ) > 0:
                        line = line[ int( options.start )-1 : int( options.end ) ]
                    else:
                        line = line[ int( options.start )-1 : ]
                else:
                    fields = line.split( '\t' )
                    if col-1 > len( fields ):
                        stop_err('Column %d does not exist. Check input parameters\n' % col)
                        
                    if int( options.end ) > 0:
                        fields[col - 1] = fields[col - 1][ int( options.start )-1 : int( options.end ) ]
                    else:
                        fields[col - 1] = fields[col - 1][ int( options.start )-1 : ]
                    line = '\t'.join(fields)
            print line   

if __name__ == "__main__": main()