Mercurial > repos > richard-burhans > segalign
comparison diagonal_partition.py @ 0:5c72425b7f1b draft
planemo upload for repository https://github.com/richard-burhans/galaxytools/tree/main/tools/segalign commit 98a4dd44360447aa96d92143384d78e116d7581b
author | richard-burhans |
---|---|
date | Wed, 17 Apr 2024 18:06:54 +0000 |
parents | |
children | 9e34b25a8670 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5c72425b7f1b |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 """ | |
4 Diagonal partitioning for segment files output by SegAlign. | |
5 | |
6 Usage: | |
7 diagonal_partition <max-segments> <lastz-command> | |
8 """ | |
9 | |
10 import os | |
11 import sys | |
12 | |
13 | |
14 def chunks(lst, n): | |
15 """Yield successive n-sized chunks from list.""" | |
16 for i in range(0, len(lst), n): | |
17 yield lst[i: i + n] | |
18 | |
19 | |
20 if __name__ == "__main__": | |
21 | |
22 DELETE_AFTER_CHUNKING = True | |
23 | |
24 # input_params = "10000 sad sadsa sad --segments=tmp10.block5.r1239937044.plus.segments dsa sa --strand=plus --output=out.maf sadads 2> logging.err" | |
25 # sys.argv = [sys.argv[0]] + input_params.split(' ') | |
26 chunk_size = int(sys.argv[1]) # first parameter contains chunk size | |
27 params = sys.argv[2:] | |
28 | |
29 # Parsing command output from SegAlign | |
30 segment_key = "--segments=" | |
31 segment_index = None | |
32 input_file = None | |
33 | |
34 for index, value in enumerate(params): | |
35 if value[: len(segment_key)] == segment_key: | |
36 segment_index = index | |
37 input_file = value[len(segment_key):] | |
38 break | |
39 if segment_index is None: | |
40 print(f"Error: could not segment key {segment_key} in parameters {params}") | |
41 exit(0) | |
42 | |
43 if not os.path.isfile(input_file): | |
44 print(f"Error: File {input_file} does not exist") | |
45 exit(0) | |
46 | |
47 if ( | |
48 chunk_size == 0 or sum(1 for _ in open(input_file)) <= chunk_size | |
49 ): # no need to sort if number of lines <= chunk_size | |
50 print(" ".join(params), flush=True) | |
51 exit(0) | |
52 | |
53 # Find rest of relevant parameters | |
54 output_key = "--output=" | |
55 output_index = None | |
56 output_alignment_file = None | |
57 output_alignment_file_base = None | |
58 output_format = None | |
59 | |
60 strand_key = "--strand=" | |
61 strand_index = None | |
62 for index, value in enumerate(params): | |
63 if value[: len(output_key)] == output_key: | |
64 output_index = index | |
65 output_alignment_file = value[len(output_key):] | |
66 output_alignment_file_base, output_format = output_alignment_file.rsplit( | |
67 ".", 1 | |
68 ) | |
69 if value[: len(strand_key)] == strand_key: | |
70 strand_index = index | |
71 if segment_index is None: | |
72 print(f"Error: could not output key {output_key} in parameters {params}") | |
73 exit(0) | |
74 if strand_index is None: | |
75 print(f"Error: could not output key {strand_key} in parameters {params}") | |
76 exit(0) | |
77 | |
78 err_index = -1 # error file is at very end | |
79 err_name_base = params[-1].split(".err", 1)[0] | |
80 | |
81 data = {} # dict of list of tuple (x, y, str) | |
82 | |
83 direction = None | |
84 if "plus" in params[strand_index]: | |
85 direction = "f" | |
86 elif "minus" in params[strand_index]: | |
87 direction = "r" | |
88 else: | |
89 print( | |
90 f"Error: could not figure out direction from strand value {params[strand_index]}" | |
91 ) | |
92 exit(0) | |
93 | |
94 for line in open(input_file, "r"): | |
95 if line == "": | |
96 continue | |
97 ( | |
98 seq1_name, | |
99 seq1_start, | |
100 seq1_end, | |
101 seq2_name, | |
102 seq2_start, | |
103 seq2_end, | |
104 _dir, | |
105 score, | |
106 ) = line.split() | |
107 # data.append((int(seq1_start), int(seq2_start), line)) | |
108 half_dist = int((int(seq1_end) - int(seq1_start)) // 2) | |
109 assert int(seq1_end) > int(seq1_start) | |
110 assert int(seq2_end) > int(seq2_start) | |
111 seq1_mid = int(seq1_start) + half_dist | |
112 seq2_mid = int(seq2_start) + half_dist | |
113 data.setdefault((seq1_name, seq2_name), []).append((seq1_mid, seq2_mid, line)) | |
114 | |
115 # If there are chromosome pairs with segment count <= chunk_size | |
116 # then no need to sort and split these pairs into separate files. | |
117 # It is better to keep these pairs in a single segment file. | |
118 skip_pairs = [] # pairs that have count <= chunk_size. | |
119 # these will not be sorted | |
120 if len(data.keys()) > 1: | |
121 for pair in data.keys(): | |
122 if len(data[pair]) <= chunk_size: | |
123 skip_pairs.append(pair) | |
124 | |
125 # sorting for forward segments | |
126 if direction == "r": | |
127 for pair in data.keys(): | |
128 if pair not in skip_pairs: | |
129 data[pair] = sorted( | |
130 data[pair], key=lambda coord: (coord[1] - coord[0], coord[0]) | |
131 ) | |
132 # sorting for reverse segments | |
133 elif direction == "f": | |
134 for pair in data.keys(): | |
135 if pair not in skip_pairs: | |
136 data[pair] = sorted( | |
137 data[pair], key=lambda coord: (coord[1] + coord[0], coord[0]) | |
138 ) | |
139 else: | |
140 print(f"INVALID DIRECTION VALUE: {direction}") | |
141 exit(0) | |
142 | |
143 # Writing file in chunks | |
144 ctr = 0 | |
145 for pair in data.keys() - skip_pairs: | |
146 for chunk in chunks(list(zip(*data[pair]))[2], chunk_size): | |
147 ctr += 1 | |
148 name_addition = f".split{ctr}" | |
149 fname = input_file.split(".segments", 1)[0] + name_addition + ".segments" | |
150 | |
151 with open(fname, "w") as f: | |
152 f.writelines(chunk) | |
153 # update segment file in command | |
154 params[segment_index] = segment_key + fname | |
155 # update output file in command | |
156 params[output_index] = ( | |
157 output_key | |
158 + output_alignment_file_base | |
159 + name_addition | |
160 + "." | |
161 + output_format | |
162 ) | |
163 # update error file in command | |
164 params[-1] = err_name_base + name_addition + ".err" | |
165 print(" ".join(params), flush=True) | |
166 | |
167 # writing unsorted skipped pairs | |
168 if len(skip_pairs) > 0: | |
169 skip_pairs_with_len = sorted( | |
170 [(len(data[p]), p) for p in skip_pairs] | |
171 ) # list of tuples of (pair length, pair) | |
172 aggregated_skip_pairs = [] # list of list of pair names | |
173 current_count = 0 | |
174 aggregated_skip_pairs.append([]) | |
175 for count, pair in skip_pairs_with_len: | |
176 if current_count + count <= chunk_size: | |
177 current_count += count | |
178 aggregated_skip_pairs[-1].append(pair) | |
179 else: | |
180 aggregated_skip_pairs.append([]) | |
181 current_count = count | |
182 aggregated_skip_pairs[-1].append(pair) | |
183 | |
184 for aggregate in aggregated_skip_pairs: | |
185 ctr += 1 | |
186 name_addition = f".split{ctr}" | |
187 fname = input_file.split(".segments", 1)[0] + name_addition + ".segments" | |
188 with open(fname, "w") as f: | |
189 for pair in sorted(aggregate, key=lambda p: (p[1], p[0])): | |
190 chunk = list(zip(*data[pair]))[2] | |
191 f.writelines(chunk) | |
192 # update segment file in command | |
193 params[segment_index] = segment_key + fname | |
194 # update output file in command | |
195 params[output_index] = ( | |
196 output_key | |
197 + output_alignment_file_base | |
198 + name_addition | |
199 + "." | |
200 + output_format | |
201 ) | |
202 # update error file in command | |
203 params[-1] = err_name_base + name_addition + ".err" | |
204 print(" ".join(params), flush=True) | |
205 | |
206 if DELETE_AFTER_CHUNKING: | |
207 os.remove(input_file) |