Mercurial > repos > xuebing > sharplabtool
diff tools/filters/axt_to_lav.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/axt_to_lav.py Fri Mar 09 19:37:19 2012 -0500 @@ -0,0 +1,176 @@ +#!/usr/bin/env python +""" +Application to convert AXT file to LAV file +------------------------------------------- + +:Author: Bob Harris (rsharris@bx.psu.edu) +:Version: $Revision: $ + +The application reads an AXT file from standard input and writes a LAV file to +standard out; some statistics are written to standard error. +""" + +import sys, copy +from galaxy import eggs +import pkg_resources +pkg_resources.require( "bx-python" ) +import bx.align.axt +import bx.align.lav + +assert sys.version_info[:2] >= ( 2, 4 ) + +def usage(s=None): + message = """ +axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file + Each spec is of the form seq_file[:species_name]:lengths_file. + + seq_file should be a format string for the file names for the individual + sequences, with %s to be replaced by the alignment's src field. For example, + "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib", + etc. + + species_name is optional. If present, it is prepended to the alignment's src + field. + + Lengths files provide the length of each chromosome (lav format needs this + information but axt file does not contain it). The format is a series of + lines of the form + <chromosome name> <length> + The chromosome field in each axt block must match some <chromosome name> in + the lengths file. +""" + if (s == None): sys.exit (message) + else: sys.exit ("%s\n%s" % (s,message)) + + +def main(): + global debug + + # parse the command line + + primary = None + secondary = None + silent = False + + # pick off options + + args = sys.argv[1:] + seq_file2 = open(args.pop(-1),'w') + seq_file1 = open(args.pop(-1),'w') + lav_out = args.pop(-1) + axt_in = args.pop(-1) + while (len(args) > 0): + arg = args.pop(0) + val = None + fields = arg.split("=",1) + if (len(fields) == 2): + arg = fields[0] + val = fields[1] + if (val == ""): + usage("missing a value in %s=" % arg) + + if (arg == "--silent") and (val == None): + silent = True + elif (primary == None) and (val == None): + primary = arg + elif (secondary == None) and (val == None): + secondary = arg + else: + usage("unknown argument: %s" % arg) + + if (primary == None): + usage("missing primary file name and length") + + if (secondary == None): + usage("missing secondary file name and length") + + try: + (primaryFile,primary,primaryLengths) = parse_spec(primary) + except: + usage("bad primary spec (must be seq_file[:species_name]:lengths_file") + + try: + (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary) + except: + usage("bad secondary spec (must be seq_file[:species_name]:lengths_file") + + # read the lengths + + speciesToLengths = {} + speciesToLengths[primary] = read_lengths (primaryLengths) + speciesToLengths[secondary] = read_lengths (secondaryLengths) + + # read the alignments + + out = bx.align.lav.Writer(open(lav_out,'w'), \ + attributes = { "name_format_1" : primaryFile, + "name_format_2" : secondaryFile }) + + axtsRead = 0 + axtsWritten = 0 + for axtBlock in bx.align.axt.Reader(open(axt_in), \ + species_to_lengths = speciesToLengths, + species1 = primary, + species2 = secondary, + support_ids = True): + axtsRead += 1 + out.write (axtBlock) + primary_c = axtBlock.get_component_by_src_start(primary) + secondary_c = axtBlock.get_component_by_src_start(secondary) + + print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size) + print >>seq_file1,primary_c.text + print >>seq_file1 + + print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size) + print >>seq_file2,secondary_c.text + print >>seq_file2 + axtsWritten += 1 + + out.close() + seq_file1.close() + seq_file2.close() + + if (not silent): + sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten)) + +def parse_spec(spec): # returns (seq_file,species_name,lengths_file) + fields = spec.split(":") + if (len(fields) == 2): return (fields[0],"",fields[1]) + elif (len(fields) == 3): return (fields[0],fields[1],fields[2]) + else: raise ValueError + +def read_lengths (fileName): + + chromToLength = {} + + f = file (fileName, "r") + + for lineNumber,line in enumerate(f): + line = line.strip() + if (line == ""): continue + if (line.startswith("#")): continue + + fields = line.split () + if (len(fields) != 2): + raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line) + + chrom = fields[0] + try: + length = int(fields[1]) + except: + raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line) + + if (chrom in chromToLength): + raise "%s appears more than once (%s:%d): %s" \ + % (chrom,fileName,lineNumber) + + chromToLength[chrom] = length + + f.close () + + return chromToLength + + +if __name__ == "__main__": main() +