view tools/metag_tools/blat_mapping.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
line wrap: on
line source

#!/usr/bin/env python

import os, sys

assert sys.version_info[:2] >= ( 2, 4 )

def reverse_complement(s):
    complement_dna = {"A":"T", "T":"A", "C":"G", "G":"C", "a":"t", "t":"a", "c":"g", "g":"c", "N":"N", "n":"n" , ".":"."}
    reversed_s = []
    for i in s:
        reversed_s.append(complement_dna[i])
    reversed_s.reverse()
    return "".join(reversed_s)
    
def __main__():
    nuc_index = {'a':0,'t':1,'c':2,'g':3,'n':4}
    coverage = {}        # key = (chrom, index)
    invalid_lines = 0
    invalid_chrom = 0
    infile = sys.argv[1]
    outfile = sys.argv[2]

    for i, line in enumerate( open( infile ) ):
        line = line.rstrip('\r\n')
        if not line or line.startswith('#'):
            continue
        fields = line.split()
        if len(fields) < 21:                # standard number of pslx columns
            invalid_lines += 1
            continue 
        if not fields[0].isdigit():
            invalid_lines += 1
            continue
        chrom = fields[13]
        if not chrom.startswith( 'chr' ):
            invalid_lines += 1
            invalid_chrom += 1
            continue
        try:
            block_count = int(fields[17])
        except:
            invalid_lines += 1
            continue
        block_size = fields[18].split(',')
        chrom_start = fields[20].split(',')

        for j in range( block_count ):
            try:
                this_block_size = int(block_size[j])
                this_chrom_start = int(chrom_start[j])
            except:
                invalid_lines += 1
                break
            # brut force coverage                
            for k in range( this_block_size ):
                cur_index = this_chrom_start + k
                if coverage.has_key( ( chrom, cur_index ) ):
                    coverage[(chrom, cur_index)] += 1
                else:
                    coverage[(chrom, cur_index)] = 1
                
    # generate a index file
    outputfh = open(outfile, 'w')
    keys = coverage.keys()
    keys.sort()
    previous_chrom = ''
    for i in keys:
        (chrom, location) = i
        sum = coverage[(i)]
        if chrom != previous_chrom:
            outputfh.write( 'variableStep chrom=%s\n' % ( chrom ) )
            previous_chrom = chrom
        outputfh.write( "%s\t%s\n" % ( location, sum ) )
    outputfh.close()
    
    if invalid_lines:
        invalid_msg = "Skipped %d invalid lines" % invalid_lines
        if invalid_chrom:
            invalid_msg += ", including %d lines with chrom id errors which must begin with 'chr' to map correctly to the UCSC Genome Browser. "
        
if __name__ == '__main__': __main__()