Mercurial > repos > bgruening > split_file_to_collection
comparison split_file_to_collection.py @ 9:baabc30154cd draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 37305ce5e6ef1b91f6058eff5b128064f76fbe45
author | bgruening |
---|---|
date | Thu, 23 Nov 2023 20:02:01 +0000 |
parents | 6cbe2f30c2d7 |
children | 2dae863c8f42 |
comparison
equal
deleted
inserted
replaced
8:6cbe2f30c2d7 | 9:baabc30154cd |
---|---|
11 # - number of lines to split after (0 if not splitting by number of lines but regex) | 11 # - number of lines to split after (0 if not splitting by number of lines but regex) |
12 # - a boolean indicating if the record separator is at the end of the record | 12 # - a boolean indicating if the record separator is at the end of the record |
13 # | 13 # |
14 # new file types can be added by appending to this dict, | 14 # new file types can be added by appending to this dict, |
15 # updating the parser, and adding a new type option in the Galaxy wrapper | 15 # updating the parser, and adding a new type option in the Galaxy wrapper |
16 FILETYPES = {'fasta': (r'^>', 0, False), | 16 FILETYPES = { |
17 'fastq': (r'', 4, False), | 17 "fasta": (r"^>", 0, False), |
18 'tabular': (r'', 1, False), | 18 "fastq": (r"", 4, False), |
19 'txt': (r'', 1, False), | 19 "tabular": (r"", 1, False), |
20 'mgf': (r'^BEGIN IONS', 0, False), | 20 "txt": (r"", 1, False), |
21 'sdf': (r'\$\$\$\$', 0, True), | 21 "mgf": (r"^BEGIN IONS", 0, False), |
22 } | 22 "sdf": (r"\$\$\$\$", 0, True), |
23 } | |
23 | 24 |
24 | 25 |
25 def main(): | 26 def main(): |
26 ps = parser_cli() | 27 ps = parser_cli() |
27 args = vars(ps.parse_args()) | 28 args = vars(ps.parse_args()) |
28 | 29 |
29 # get args and validate | 30 # get args and validate |
30 in_file = args["in"] | 31 in_file = args["in"] |
31 if not os.path.isfile(args["in"]): | 32 if not os.path.isfile(args["in"]): |
32 raise FileNotFoundError('Input file does not exist') | 33 raise FileNotFoundError("Input file does not exist") |
33 | 34 |
34 out_dir = args["out_dir"] | 35 out_dir = args["out_dir"] |
35 if not os.path.isdir(args["out_dir"]): | 36 if not os.path.isdir(args["out_dir"]): |
36 raise FileNotFoundError('out_dir is not a directory') | 37 raise FileNotFoundError("out_dir is not a directory") |
37 | 38 |
38 top = args["top"] | 39 top = args["top"] |
39 if top < 0: | 40 if top < 0: |
40 raise ValueError("Number of header lines cannot be negative") | 41 raise ValueError("Number of header lines cannot be negative") |
41 | 42 |
42 ftype = args["ftype"] | 43 ftype = args["ftype"] |
43 | 44 |
44 assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input" | 45 assert ( |
46 ftype != "generic" or args["generic_re"] is not None | |
47 ), "--generic_re needs to be given for generic input" | |
45 | 48 |
46 if args["ftype"] == "tabular" and args["by"] == "col": | 49 if args["ftype"] == "tabular" and args["by"] == "col": |
47 args["match"] = replace_mapped_chars(args["match"]) | 50 args["match"] = replace_mapped_chars(args["match"]) |
48 args["sub"] = replace_mapped_chars(args["sub"]) | 51 args["sub"] = replace_mapped_chars(args["sub"]) |
49 split_by_column(args, in_file, out_dir, top) | 52 split_by_column(args, in_file, out_dir, top) |
51 args["generic_re"] = replace_mapped_chars(args["generic_re"]) | 54 args["generic_re"] = replace_mapped_chars(args["generic_re"]) |
52 split_by_record(args, in_file, out_dir, top, ftype) | 55 split_by_record(args, in_file, out_dir, top, ftype) |
53 | 56 |
54 | 57 |
55 def parser_cli(): | 58 def parser_cli(): |
56 parser = argparse.ArgumentParser(description="split a file into multiple files. " + | 59 parser = argparse.ArgumentParser( |
57 "Can split on the column of a tabular file, " + | 60 description="split a file into multiple files. " |
58 "with custom and useful names based on column value.") | 61 + "Can split on the column of a tabular file, " |
59 parser.add_argument('--in', '-i', required=True, help="The input file") | 62 + "with custom and useful names based on column value." |
60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) | 63 ) |
61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") | 64 parser.add_argument("--in", "-i", required=True, help="The input file") |
62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + | 65 parser.add_argument( |
63 " the extension of the new files (without a period)") | 66 "--out_dir", |
64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True, | 67 "-o", |
65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) | 68 default=os.getcwd(), |
66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", | 69 help="The output directory", |
67 default="row", choices=["col", "row"]) | 70 required=True, |
68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") | 71 ) |
69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') | 72 parser.add_argument( |
70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + | 73 "--file_names", |
71 "If not provided and args[\"rand\"]==True, then date is used", type=int) | 74 "-a", |
75 help="If not splitting by column, the base name of the new files", | |
76 ) | |
77 parser.add_argument( | |
78 "--file_ext", | |
79 "-e", | |
80 help="If not splitting by column," | |
81 + " the extension of the new files (without a period)", | |
82 ) | |
83 parser.add_argument( | |
84 "--ftype", | |
85 "-f", | |
86 help="The type of the file to split", | |
87 required=True, | |
88 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"], | |
89 ) | |
90 parser.add_argument( | |
91 "--by", | |
92 "-b", | |
93 help="Split by line or by column (tabular only)", | |
94 default="row", | |
95 choices=["col", "row"], | |
96 ) | |
97 parser.add_argument( | |
98 "--top", | |
99 "-t", | |
100 type=int, | |
101 default=0, | |
102 help="Number of header lines to carry over to new files.", | |
103 ) | |
104 parser.add_argument( | |
105 "--rand", | |
106 "-r", | |
107 help="Divide records randomly into new files", | |
108 action="store_true", | |
109 ) | |
110 parser.add_argument( | |
111 "--seed", | |
112 "-x", | |
113 help="Provide a seed for the random number generator. " | |
114 + 'If not provided and args["rand"]==True, then date is used', | |
115 type=int, | |
116 ) | |
72 group = parser.add_mutually_exclusive_group() | 117 group = parser.add_mutually_exclusive_group() |
73 group.add_argument('--numnew', '-n', type=int, default=1, | 118 group.add_argument( |
74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") | 119 "--numnew", |
75 group.add_argument('--chunksize', '-k', type=int, default=0, | 120 "-n", |
76 help="Number of records by file. Not valid for splitting on a column") | 121 type=int, |
77 parser.add_argument('--batch', action='store_true', | 122 default=1, |
78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") | 123 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.", |
79 generic = parser.add_argument_group('Arguments controling generic splitting') | 124 ) |
125 group.add_argument( | |
126 "--chunksize", | |
127 "-k", | |
128 type=int, | |
129 default=0, | |
130 help="Number of records by file. Not valid for splitting on a column", | |
131 ) | |
132 parser.add_argument( | |
133 "--batch", | |
134 action="store_true", | |
135 help="Distribute files to collection while maintaining order. Ignored if splitting on column.", | |
136 ) | |
137 generic = parser.add_argument_group("Arguments controling generic splitting") | |
80 group = generic.add_mutually_exclusive_group() | 138 group = generic.add_mutually_exclusive_group() |
81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False) | 139 group.add_argument( |
82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False) | 140 "--generic_re", |
83 generic.add_argument('--split_after', '-p', action='store_true', | 141 "-g", |
84 help="Split between records after separator (default is before). " + | 142 default="", |
85 "Only for generic splitting by regex - specific ftypes are always split in the default way") | 143 help="Regular expression indicating the start of a new record (only for generic)", |
86 bycol = parser.add_argument_group('If splitting on a column') | 144 required=False, |
87 bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries") | 145 ) |
88 bycol.add_argument('--sub', '-s', default=r'\1', | 146 group.add_argument( |
89 help="The regular expression to substitute in for the matched pattern.") | 147 "--generic_num", |
90 bycol.add_argument('--id_column', '-c', default="1", | 148 type=int, |
91 help="Column that is used to name output files. Indexed starting from 1.", type=int) | 149 default=0, |
150 help="Length of records in number of lines (only for generic)", | |
151 required=False, | |
152 ) | |
153 generic.add_argument( | |
154 "--split_after", | |
155 "-p", | |
156 action="store_true", | |
157 help="Split between records after separator (default is before). " | |
158 + "Only for generic splitting by regex - specific ftypes are always split in the default way", | |
159 ) | |
160 bycol = parser.add_argument_group("If splitting on a column") | |
161 bycol.add_argument( | |
162 "--match", | |
163 "-m", | |
164 default="(.*)", | |
165 help="The regular expression to match id column entries", | |
166 ) | |
167 bycol.add_argument( | |
168 "--sub", | |
169 "-s", | |
170 default=r"\1", | |
171 help="The regular expression to substitute in for the matched pattern.", | |
172 ) | |
173 bycol.add_argument( | |
174 "--id_column", | |
175 "-c", | |
176 default="1", | |
177 help="Column that is used to name output files. Indexed starting from 1.", | |
178 type=int, | |
179 ) | |
92 return parser | 180 return parser |
93 | 181 |
94 | 182 |
95 def replace_mapped_chars(pattern): | 183 def replace_mapped_chars(pattern): |
96 """ | 184 """ |
97 handles special escaped characters when coming from galaxy | 185 handles special escaped characters when coming from galaxy |
98 """ | 186 """ |
99 mapped_chars = {'\'': '__sq__', '\\': '__backslash__'} | 187 mapped_chars = {"'": "__sq__", "\\": "__backslash__"} |
100 for key, value in mapped_chars.items(): | 188 for key, value in mapped_chars.items(): |
101 pattern = pattern.replace(value, key) | 189 pattern = pattern.replace(value, key) |
102 return pattern | 190 return pattern |
103 | 191 |
104 | 192 |
105 def split_by_record(args, in_file, out_dir, top, ftype): | 193 def split_by_record(args, in_file, out_dir, top, ftype): |
106 # get configuration (record separator, start at end) for given filetype | 194 # get configuration (record separator, start at end) for given filetype |
107 sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"])) | 195 sep, num, sep_at_end = FILETYPES.get( |
196 ftype, (args["generic_re"], args["generic_num"], args["split_after"]) | |
197 ) | |
108 sep = re.compile(sep) | 198 sep = re.compile(sep) |
109 | 199 |
110 chunksize = args["chunksize"] | 200 chunksize = args["chunksize"] |
111 numnew = args["numnew"] | 201 numnew = args["numnew"] |
112 | 202 |
124 # determine | 214 # determine |
125 # - the number of records that should be stored per file | 215 # - the number of records that should be stored per file |
126 # (done always, even if used only for batch mode) | 216 # (done always, even if used only for batch mode) |
127 # - if the separator is at the start / end of the record | 217 # - if the separator is at the start / end of the record |
128 n_per_file = math.inf | 218 n_per_file = math.inf |
129 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected | 219 if ( |
220 chunksize != 0 or batch | |
221 ): # needs to be calculated if either batch or chunksize are selected | |
130 with open(in_file) as f: | 222 with open(in_file) as f: |
131 # read header lines | 223 # read header lines |
132 for i in range(top): | 224 for i in range(top): |
133 f.readline() | 225 f.readline() |
134 n_records = 0 | 226 n_records = 0 |
227 last_line_matched = False | |
135 for line in f: | 228 for line in f: |
136 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0): | 229 if (num == 0 and re.match(sep, line) is not None) or ( |
230 num > 0 and n_records % num == 0 | |
231 ): | |
137 n_records += 1 | 232 n_records += 1 |
138 last_line_matched = True | 233 last_line_matched = True |
139 else: | 234 else: |
140 last_line_matched = False | 235 last_line_matched = False |
141 if sep_at_end and not last_line_matched: | 236 if sep_at_end and not last_line_matched: |
145 numnew = min(numnew, n_records) | 240 numnew = min(numnew, n_records) |
146 # approx. number of records per file | 241 # approx. number of records per file |
147 if chunksize == 0: # i.e. no chunking | 242 if chunksize == 0: # i.e. no chunking |
148 n_per_file = n_records // numnew | 243 n_per_file = n_records // numnew |
149 else: | 244 else: |
150 numnew = n_records // chunksize | 245 numnew = max(n_records // chunksize, 1) # should not be less than 1 |
151 n_per_file = chunksize | 246 n_per_file = chunksize |
152 | 247 |
153 # make new files | 248 # make new files |
154 # strip extension of old file and add number | 249 # strip extension of old file and add number |
155 custom_new_file_name = args["file_names"] | 250 custom_new_file_name = args["file_names"] |
157 if custom_new_file_name is None: | 252 if custom_new_file_name is None: |
158 new_file_base = os.path.splitext(os.path.basename(in_file)) | 253 new_file_base = os.path.splitext(os.path.basename(in_file)) |
159 else: | 254 else: |
160 new_file_base = [custom_new_file_name, custom_new_file_ext] | 255 new_file_base = [custom_new_file_name, custom_new_file_ext] |
161 | 256 |
162 newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)] | 257 newfile_names = [ |
258 os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) | |
259 for count in range(0, numnew) | |
260 ] | |
163 # bunch o' counters | 261 # bunch o' counters |
164 # index to list of new files | 262 # index to list of new files |
165 if rand: | 263 if rand: |
166 new_file_counter = int(math.floor(random.random() * numnew)) | 264 new_file_counter = int(math.floor(random.random() * numnew)) |
167 else: | 265 else: |
184 | 282 |
185 record = "" | 283 record = "" |
186 for line_no, line in enumerate(f): | 284 for line_no, line in enumerate(f): |
187 # check if beginning of line is record sep | 285 # check if beginning of line is record sep |
188 # if beginning of line is record sep, either start record or finish one | 286 # if beginning of line is record sep, either start record or finish one |
189 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0): | 287 if (num == 0 and re.match(sep, line) is not None) or ( |
288 num > 0 and line_no % num == 0 | |
289 ): | |
190 # this only happens first time through | 290 # this only happens first time through |
191 if record == "": | 291 if record == "": |
192 record += line | 292 record += line |
193 else: | 293 else: |
194 # if is in fresh_files, write header and drop from freshFiles | 294 # if is in fresh_files, write header and drop from freshFiles |
258 n_read += 1 | 358 n_read += 1 |
259 if n_read <= top: | 359 if n_read <= top: |
260 header += line | 360 header += line |
261 continue | 361 continue |
262 # split into columns, on tab | 362 # split into columns, on tab |
263 fields = re.split(r'\t', line.strip('\n')) | 363 fields = re.split(r"\t", line.strip("\n")) |
264 | 364 |
265 # get id column value | 365 # get id column value |
266 id_col_val = fields[id_col] | 366 id_col_val = fields[id_col] |
267 | 367 |
268 # use regex to get new file name | 368 # use regex to get new file name |