view tools/filters/axt_to_lav.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
line wrap: on
line source

#!/usr/bin/env python
"""
Application to convert AXT file to LAV file
-------------------------------------------

:Author: Bob Harris (rsharris@bx.psu.edu)
:Version: $Revision: $

The application reads an AXT file from standard input and writes a LAV file to
standard out;  some statistics are written to standard error.
"""

import sys, copy
from galaxy import eggs
import pkg_resources
pkg_resources.require( "bx-python" )
import bx.align.axt
import bx.align.lav

assert sys.version_info[:2] >= ( 2, 4 )

def usage(s=None):
    message = """
axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
  Each spec is of the form seq_file[:species_name]:lengths_file.

  seq_file should be a format string for the file names for the individual
  sequences, with %s to be replaced by the alignment's src field.  For example,
  "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
  etc.

  species_name is optional.  If present, it is prepended to the alignment's src
  field.

  Lengths files provide the length of each chromosome (lav format needs this
  information but axt file does not contain it).  The format is a series of
  lines of the form
    <chromosome name> <length>
  The chromosome field in each axt block must match some <chromosome name> in
  the lengths file.
"""
    if (s == None): sys.exit (message)
    else:           sys.exit ("%s\n%s" % (s,message))


def main():
    global debug

    # parse the command line

    primary   = None
    secondary = None
    silent    = False

    # pick off options

    args = sys.argv[1:]
    seq_file2 = open(args.pop(-1),'w')
    seq_file1 = open(args.pop(-1),'w')
    lav_out = args.pop(-1)
    axt_in = args.pop(-1)
    while (len(args) > 0):
        arg = args.pop(0)
        val = None
        fields = arg.split("=",1)
        if (len(fields) == 2):
            arg = fields[0]
            val = fields[1]
            if (val == ""):
                usage("missing a value in %s=" % arg)

        if (arg == "--silent") and (val == None):
            silent = True
        elif (primary == None) and (val == None):
            primary = arg
        elif (secondary == None) and (val == None):
            secondary = arg
        else:
            usage("unknown argument: %s" % arg)

    if (primary == None):
        usage("missing primary file name and length")

    if (secondary == None):
        usage("missing secondary file name and length")

    try:
        (primaryFile,primary,primaryLengths) = parse_spec(primary)
    except:
        usage("bad primary spec (must be seq_file[:species_name]:lengths_file")

    try:
        (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary)
    except:
        usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")

    # read the lengths

    speciesToLengths = {}
    speciesToLengths[primary]   = read_lengths (primaryLengths)
    speciesToLengths[secondary] = read_lengths (secondaryLengths)

    # read the alignments

    out = bx.align.lav.Writer(open(lav_out,'w'), \
            attributes = { "name_format_1" : primaryFile,
                           "name_format_2" : secondaryFile })

    axtsRead = 0
    axtsWritten = 0
    for axtBlock in bx.align.axt.Reader(open(axt_in), \
            species_to_lengths = speciesToLengths,
            species1           = primary,
            species2           = secondary,
            support_ids        = True):
        axtsRead += 1
        out.write (axtBlock)
        primary_c = axtBlock.get_component_by_src_start(primary)
        secondary_c = axtBlock.get_component_by_src_start(secondary)
        
        print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size)
        print >>seq_file1,primary_c.text
        print >>seq_file1
        
        print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size)
        print >>seq_file2,secondary_c.text
        print >>seq_file2
        axtsWritten += 1

    out.close()
    seq_file1.close()
    seq_file2.close()

    if (not silent):
        sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten))

def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
    fields = spec.split(":")
    if   (len(fields) == 2): return (fields[0],"",fields[1])
    elif (len(fields) == 3): return (fields[0],fields[1],fields[2])
    else:                    raise ValueError

def read_lengths (fileName):

    chromToLength = {}

    f = file (fileName, "r")

    for lineNumber,line in enumerate(f):
        line = line.strip()
        if (line == ""): continue
        if (line.startswith("#")): continue

        fields = line.split ()
        if (len(fields) != 2):
            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)

        chrom = fields[0]
        try:
            length = int(fields[1])
        except:
            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)

        if (chrom in chromToLength):
            raise "%s appears more than once (%s:%d): %s" \
                % (chrom,fileName,lineNumber)

        chromToLength[chrom] = length

    f.close ()

    return chromToLength


if __name__ == "__main__": main()