0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Application to convert AXT file to LAV file
|
|
4 -------------------------------------------
|
|
5
|
|
6 :Author: Bob Harris (rsharris@bx.psu.edu)
|
|
7 :Version: $Revision: $
|
|
8
|
|
9 The application reads an AXT file from standard input and writes a LAV file to
|
|
10 standard out; some statistics are written to standard error.
|
|
11 """
|
|
12
|
|
13 import sys, copy
|
|
14 from galaxy import eggs
|
|
15 import pkg_resources
|
|
16 pkg_resources.require( "bx-python" )
|
|
17 import bx.align.axt
|
|
18 import bx.align.lav
|
|
19
|
|
20 assert sys.version_info[:2] >= ( 2, 4 )
|
|
21
|
|
22 def usage(s=None):
|
|
23 message = """
|
|
24 axt_to_lav primary_spec secondary_spec [--silent] < axt_file > lav_file
|
|
25 Each spec is of the form seq_file[:species_name]:lengths_file.
|
|
26
|
|
27 seq_file should be a format string for the file names for the individual
|
|
28 sequences, with %s to be replaced by the alignment's src field. For example,
|
|
29 "hg18/%s.nib" would prescribe files named "hg18/chr1.nib", "hg18/chr2.nib",
|
|
30 etc.
|
|
31
|
|
32 species_name is optional. If present, it is prepended to the alignment's src
|
|
33 field.
|
|
34
|
|
35 Lengths files provide the length of each chromosome (lav format needs this
|
|
36 information but axt file does not contain it). The format is a series of
|
|
37 lines of the form
|
|
38 <chromosome name> <length>
|
|
39 The chromosome field in each axt block must match some <chromosome name> in
|
|
40 the lengths file.
|
|
41 """
|
|
42 if (s == None): sys.exit (message)
|
|
43 else: sys.exit ("%s\n%s" % (s,message))
|
|
44
|
|
45
|
|
46 def main():
|
|
47 global debug
|
|
48
|
|
49 # parse the command line
|
|
50
|
|
51 primary = None
|
|
52 secondary = None
|
|
53 silent = False
|
|
54
|
|
55 # pick off options
|
|
56
|
|
57 args = sys.argv[1:]
|
|
58 seq_file2 = open(args.pop(-1),'w')
|
|
59 seq_file1 = open(args.pop(-1),'w')
|
|
60 lav_out = args.pop(-1)
|
|
61 axt_in = args.pop(-1)
|
|
62 while (len(args) > 0):
|
|
63 arg = args.pop(0)
|
|
64 val = None
|
|
65 fields = arg.split("=",1)
|
|
66 if (len(fields) == 2):
|
|
67 arg = fields[0]
|
|
68 val = fields[1]
|
|
69 if (val == ""):
|
|
70 usage("missing a value in %s=" % arg)
|
|
71
|
|
72 if (arg == "--silent") and (val == None):
|
|
73 silent = True
|
|
74 elif (primary == None) and (val == None):
|
|
75 primary = arg
|
|
76 elif (secondary == None) and (val == None):
|
|
77 secondary = arg
|
|
78 else:
|
|
79 usage("unknown argument: %s" % arg)
|
|
80
|
|
81 if (primary == None):
|
|
82 usage("missing primary file name and length")
|
|
83
|
|
84 if (secondary == None):
|
|
85 usage("missing secondary file name and length")
|
|
86
|
|
87 try:
|
|
88 (primaryFile,primary,primaryLengths) = parse_spec(primary)
|
|
89 except:
|
|
90 usage("bad primary spec (must be seq_file[:species_name]:lengths_file")
|
|
91
|
|
92 try:
|
|
93 (secondaryFile,secondary,secondaryLengths) = parse_spec(secondary)
|
|
94 except:
|
|
95 usage("bad secondary spec (must be seq_file[:species_name]:lengths_file")
|
|
96
|
|
97 # read the lengths
|
|
98
|
|
99 speciesToLengths = {}
|
|
100 speciesToLengths[primary] = read_lengths (primaryLengths)
|
|
101 speciesToLengths[secondary] = read_lengths (secondaryLengths)
|
|
102
|
|
103 # read the alignments
|
|
104
|
|
105 out = bx.align.lav.Writer(open(lav_out,'w'), \
|
|
106 attributes = { "name_format_1" : primaryFile,
|
|
107 "name_format_2" : secondaryFile })
|
|
108
|
|
109 axtsRead = 0
|
|
110 axtsWritten = 0
|
|
111 for axtBlock in bx.align.axt.Reader(open(axt_in), \
|
|
112 species_to_lengths = speciesToLengths,
|
|
113 species1 = primary,
|
|
114 species2 = secondary,
|
|
115 support_ids = True):
|
|
116 axtsRead += 1
|
|
117 out.write (axtBlock)
|
|
118 primary_c = axtBlock.get_component_by_src_start(primary)
|
|
119 secondary_c = axtBlock.get_component_by_src_start(secondary)
|
|
120
|
|
121 print >>seq_file1, ">%s_%s_%s_%s" % (primary_c.src,secondary_c.strand,primary_c.start,primary_c.start+primary_c.size)
|
|
122 print >>seq_file1,primary_c.text
|
|
123 print >>seq_file1
|
|
124
|
|
125 print >>seq_file2, ">%s_%s_%s_%s" % (secondary_c.src,secondary_c.strand,secondary_c.start,secondary_c.start+secondary_c.size)
|
|
126 print >>seq_file2,secondary_c.text
|
|
127 print >>seq_file2
|
|
128 axtsWritten += 1
|
|
129
|
|
130 out.close()
|
|
131 seq_file1.close()
|
|
132 seq_file2.close()
|
|
133
|
|
134 if (not silent):
|
|
135 sys.stdout.write ("%d blocks read, %d written\n" % (axtsRead,axtsWritten))
|
|
136
|
|
137 def parse_spec(spec): # returns (seq_file,species_name,lengths_file)
|
|
138 fields = spec.split(":")
|
|
139 if (len(fields) == 2): return (fields[0],"",fields[1])
|
|
140 elif (len(fields) == 3): return (fields[0],fields[1],fields[2])
|
|
141 else: raise ValueError
|
|
142
|
|
143 def read_lengths (fileName):
|
|
144
|
|
145 chromToLength = {}
|
|
146
|
|
147 f = file (fileName, "r")
|
|
148
|
|
149 for lineNumber,line in enumerate(f):
|
|
150 line = line.strip()
|
|
151 if (line == ""): continue
|
|
152 if (line.startswith("#")): continue
|
|
153
|
|
154 fields = line.split ()
|
|
155 if (len(fields) != 2):
|
|
156 raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
|
|
157
|
|
158 chrom = fields[0]
|
|
159 try:
|
|
160 length = int(fields[1])
|
|
161 except:
|
|
162 raise "bad lengths line (%s:%d): %s" % (fileName,lineNumber,line)
|
|
163
|
|
164 if (chrom in chromToLength):
|
|
165 raise "%s appears more than once (%s:%d): %s" \
|
|
166 % (chrom,fileName,lineNumber)
|
|
167
|
|
168 chromToLength[chrom] = length
|
|
169
|
|
170 f.close ()
|
|
171
|
|
172 return chromToLength
|
|
173
|
|
174
|
|
175 if __name__ == "__main__": main()
|
|
176
|