comparison diagonal_partition.py @ 0:5c72425b7f1b draft

planemo upload for repository https://github.com/richard-burhans/galaxytools/tree/main/tools/segalign commit 98a4dd44360447aa96d92143384d78e116d7581b
author richard-burhans
date Wed, 17 Apr 2024 18:06:54 +0000
parents
children 9e34b25a8670
comparison
equal deleted inserted replaced
-1:000000000000 0:5c72425b7f1b
1 #!/usr/bin/env python3
2
3 """
4 Diagonal partitioning for segment files output by SegAlign.
5
6 Usage:
7 diagonal_partition <max-segments> <lastz-command>
8 """
9
10 import os
11 import sys
12
13
14 def chunks(lst, n):
15 """Yield successive n-sized chunks from list."""
16 for i in range(0, len(lst), n):
17 yield lst[i: i + n]
18
19
20 if __name__ == "__main__":
21
22 DELETE_AFTER_CHUNKING = True
23
24 # input_params = "10000 sad sadsa sad --segments=tmp10.block5.r1239937044.plus.segments dsa sa --strand=plus --output=out.maf sadads 2> logging.err"
25 # sys.argv = [sys.argv[0]] + input_params.split(' ')
26 chunk_size = int(sys.argv[1]) # first parameter contains chunk size
27 params = sys.argv[2:]
28
29 # Parsing command output from SegAlign
30 segment_key = "--segments="
31 segment_index = None
32 input_file = None
33
34 for index, value in enumerate(params):
35 if value[: len(segment_key)] == segment_key:
36 segment_index = index
37 input_file = value[len(segment_key):]
38 break
39 if segment_index is None:
40 print(f"Error: could not segment key {segment_key} in parameters {params}")
41 exit(0)
42
43 if not os.path.isfile(input_file):
44 print(f"Error: File {input_file} does not exist")
45 exit(0)
46
47 if (
48 chunk_size == 0 or sum(1 for _ in open(input_file)) <= chunk_size
49 ): # no need to sort if number of lines <= chunk_size
50 print(" ".join(params), flush=True)
51 exit(0)
52
53 # Find rest of relevant parameters
54 output_key = "--output="
55 output_index = None
56 output_alignment_file = None
57 output_alignment_file_base = None
58 output_format = None
59
60 strand_key = "--strand="
61 strand_index = None
62 for index, value in enumerate(params):
63 if value[: len(output_key)] == output_key:
64 output_index = index
65 output_alignment_file = value[len(output_key):]
66 output_alignment_file_base, output_format = output_alignment_file.rsplit(
67 ".", 1
68 )
69 if value[: len(strand_key)] == strand_key:
70 strand_index = index
71 if segment_index is None:
72 print(f"Error: could not output key {output_key} in parameters {params}")
73 exit(0)
74 if strand_index is None:
75 print(f"Error: could not output key {strand_key} in parameters {params}")
76 exit(0)
77
78 err_index = -1 # error file is at very end
79 err_name_base = params[-1].split(".err", 1)[0]
80
81 data = {} # dict of list of tuple (x, y, str)
82
83 direction = None
84 if "plus" in params[strand_index]:
85 direction = "f"
86 elif "minus" in params[strand_index]:
87 direction = "r"
88 else:
89 print(
90 f"Error: could not figure out direction from strand value {params[strand_index]}"
91 )
92 exit(0)
93
94 for line in open(input_file, "r"):
95 if line == "":
96 continue
97 (
98 seq1_name,
99 seq1_start,
100 seq1_end,
101 seq2_name,
102 seq2_start,
103 seq2_end,
104 _dir,
105 score,
106 ) = line.split()
107 # data.append((int(seq1_start), int(seq2_start), line))
108 half_dist = int((int(seq1_end) - int(seq1_start)) // 2)
109 assert int(seq1_end) > int(seq1_start)
110 assert int(seq2_end) > int(seq2_start)
111 seq1_mid = int(seq1_start) + half_dist
112 seq2_mid = int(seq2_start) + half_dist
113 data.setdefault((seq1_name, seq2_name), []).append((seq1_mid, seq2_mid, line))
114
115 # If there are chromosome pairs with segment count <= chunk_size
116 # then no need to sort and split these pairs into separate files.
117 # It is better to keep these pairs in a single segment file.
118 skip_pairs = [] # pairs that have count <= chunk_size.
119 # these will not be sorted
120 if len(data.keys()) > 1:
121 for pair in data.keys():
122 if len(data[pair]) <= chunk_size:
123 skip_pairs.append(pair)
124
125 # sorting for forward segments
126 if direction == "r":
127 for pair in data.keys():
128 if pair not in skip_pairs:
129 data[pair] = sorted(
130 data[pair], key=lambda coord: (coord[1] - coord[0], coord[0])
131 )
132 # sorting for reverse segments
133 elif direction == "f":
134 for pair in data.keys():
135 if pair not in skip_pairs:
136 data[pair] = sorted(
137 data[pair], key=lambda coord: (coord[1] + coord[0], coord[0])
138 )
139 else:
140 print(f"INVALID DIRECTION VALUE: {direction}")
141 exit(0)
142
143 # Writing file in chunks
144 ctr = 0
145 for pair in data.keys() - skip_pairs:
146 for chunk in chunks(list(zip(*data[pair]))[2], chunk_size):
147 ctr += 1
148 name_addition = f".split{ctr}"
149 fname = input_file.split(".segments", 1)[0] + name_addition + ".segments"
150
151 with open(fname, "w") as f:
152 f.writelines(chunk)
153 # update segment file in command
154 params[segment_index] = segment_key + fname
155 # update output file in command
156 params[output_index] = (
157 output_key
158 + output_alignment_file_base
159 + name_addition
160 + "."
161 + output_format
162 )
163 # update error file in command
164 params[-1] = err_name_base + name_addition + ".err"
165 print(" ".join(params), flush=True)
166
167 # writing unsorted skipped pairs
168 if len(skip_pairs) > 0:
169 skip_pairs_with_len = sorted(
170 [(len(data[p]), p) for p in skip_pairs]
171 ) # list of tuples of (pair length, pair)
172 aggregated_skip_pairs = [] # list of list of pair names
173 current_count = 0
174 aggregated_skip_pairs.append([])
175 for count, pair in skip_pairs_with_len:
176 if current_count + count <= chunk_size:
177 current_count += count
178 aggregated_skip_pairs[-1].append(pair)
179 else:
180 aggregated_skip_pairs.append([])
181 current_count = count
182 aggregated_skip_pairs[-1].append(pair)
183
184 for aggregate in aggregated_skip_pairs:
185 ctr += 1
186 name_addition = f".split{ctr}"
187 fname = input_file.split(".segments", 1)[0] + name_addition + ".segments"
188 with open(fname, "w") as f:
189 for pair in sorted(aggregate, key=lambda p: (p[1], p[0])):
190 chunk = list(zip(*data[pair]))[2]
191 f.writelines(chunk)
192 # update segment file in command
193 params[segment_index] = segment_key + fname
194 # update output file in command
195 params[output_index] = (
196 output_key
197 + output_alignment_file_base
198 + name_addition
199 + "."
200 + output_format
201 )
202 # update error file in command
203 params[-1] = err_name_base + name_addition + ".err"
204 print(" ".join(params), flush=True)
205
206 if DELETE_AFTER_CHUNKING:
207 os.remove(input_file)