diff tools/filters/axt_to_lav.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/axt_to_lav.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+"""
+Application to convert AXT file to LAV file
+-------------------------------------------
+
+:Author: Bob Harris (rsharris@bx.psu.edu)
+:Version: $Revision: $
+
+The application reads an AXT file from standard input and writes a LAV file to
+standard out;  some statistics are written to standard error.
+"""
+
+import sys, copy
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "bx-python" )
+import bx.align.axt
+import bx.align.lav
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def usage(s=None):
+    message = """
+axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
+  Each spec is of the form seq_file[:species_name]:lengths_file.
+
+  seq_file should be a format string for the file names for the individual
+  sequences, with %s to be replaced by the alignment's src field.  For example,
+  "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
+  etc.
+
+  species_name is optional.  If present, it is prepended to the alignment's src
+  field.
+
+  Lengths files provide the length of each chromosome (lav format needs this
+  information but axt file does not contain it).  The format is a series of
+  lines of the form
+    <chromosome name> <length>
+  The chromosome field in each axt block must match some <chromosome name> in
+  the lengths file.
+"""
+    if (s == None): sys.exit (message)
+    else:           sys.exit ("%s\n%s" % (s,message))
+
+
+def main():
+    global debug
+
+    # parse the command line
+
+    primary   = None
+    secondary = None
+    silent    = False
+
+    # pick off options
+
+    args = sys.argv[1:]
+    seq_file2 = open(args.pop(-1),'w')
+    seq_file1 = open(args.pop(-1),'w')
+    lav_out = args.pop(-1)
+    axt_in = args.pop(-1)
+    while (len(args) > 0):
+        arg = args.pop(0)
+        val = None
+        fields = arg.split("=",1)
+        if (len(fields) == 2):
+            arg = fields[0]
+            val = fields[1]
+            if (val == ""):
+                usage("missing a value in %s=" % arg)
+
+        if (arg == "--silent") and (val == None):
+            silent = True
+        elif (primary == None) and (val == None):
+            primary = arg
+        elif (secondary == None) and (val == None):
+            secondary = arg
+        else:
+            usage("unknown argument: %s" % arg)
+
+    if (primary == None):
+        usage("missing primary file name and length")
+
+    if (secondary == None):
+        usage("missing secondary file name and length")
+
+    try:
+        (primaryFile,primary,primaryLengths) = parse_spec(primary)
+    except:
+        usage("bad primary spec (must be seq_file[:species_name]:lengths_file")
+
+    try:
+        (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary)
+    except:
+        usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")
+
+    # read the lengths
+
+    speciesToLengths = {}
+    speciesToLengths[primary]   = read_lengths (primaryLengths)
+    speciesToLengths[secondary] = read_lengths (secondaryLengths)
+
+    # read the alignments
+
+    out = bx.align.lav.Writer(open(lav_out,'w'), \
+            attributes = { "name_format_1" : primaryFile,
+                           "name_format_2" : secondaryFile })
+
+    axtsRead = 0
+    axtsWritten = 0
+    for axtBlock in bx.align.axt.Reader(open(axt_in), \
+            species_to_lengths = speciesToLengths,
+            species1           = primary,
+            species2           = secondary,
+            support_ids        = True):
+        axtsRead += 1
+        out.write (axtBlock)
+        primary_c = axtBlock.get_component_by_src_start(primary)
+        secondary_c = axtBlock.get_component_by_src_start(secondary)
+        
+        print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size)
+        print >>seq_file1,primary_c.text
+        print >>seq_file1
+        
+        print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size)
+        print >>seq_file2,secondary_c.text
+        print >>seq_file2
+        axtsWritten += 1
+
+    out.close()
+    seq_file1.close()
+    seq_file2.close()
+
+    if (not silent):
+        sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten))
+
+def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
+    fields = spec.split(":")
+    if   (len(fields) == 2): return (fields[0],"",fields[1])
+    elif (len(fields) == 3): return (fields[0],fields[1],fields[2])
+    else:                    raise ValueError
+
+def read_lengths (fileName):
+
+    chromToLength = {}
+
+    f = file (fileName, "r")
+
+    for lineNumber,line in enumerate(f):
+        line = line.strip()
+        if (line == ""): continue
+        if (line.startswith("#")): continue
+
+        fields = line.split ()
+        if (len(fields) != 2):
+            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
+
+        chrom = fields[0]
+        try:
+            length = int(fields[1])
+        except:
+            raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
+
+        if (chrom in chromToLength):
+            raise "%s appears more than once (%s:%d): %s" \
+                % (chrom,fileName,lineNumber)
+
+        chromToLength[chrom] = length
+
+    f.close ()
+
+    return chromToLength
+
+
+if __name__ == "__main__": main()
+