annotate split_file_to_collection.py @ 8:6cbe2f30c2d7 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
author bgruening
date Sun, 12 Jul 2020 10:27:06 -0400
parents 0046692724f9
children baabc30154cd
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
2
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
3 import argparse
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
4 import math
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
5 import os
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
6 import random
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
7 import re
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
8
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
9 # configuration of the splitting for specific file types
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
10 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines)
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
11 # - number of lines to split after (0 if not splitting by number of lines but regex)
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
12 # - a boolean indicating if the record separator is at the end of the record
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
13 #
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
14 # new file types can be added by appending to this dict,
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
15 # updating the parser, and adding a new type option in the Galaxy wrapper
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
16 FILETYPES = {'fasta': (r'^>', 0, False),
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
17 'fastq': (r'', 4, False),
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
18 'tabular': (r'', 1, False),
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
19 'txt': (r'', 1, False),
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
20 'mgf': (r'^BEGIN IONS', 0, False),
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
21 'sdf': (r'\$\$\$\$', 0, True),
4
0850f2dfba13 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents: 3
diff changeset
22 }
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
23
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
24
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
25 def main():
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
26 ps = parser_cli()
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
27 args = vars(ps.parse_args())
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
28
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
29 # get args and validate
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
30 in_file = args["in"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
31 if not os.path.isfile(args["in"]):
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
32 raise FileNotFoundError('Input file does not exist')
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
33
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
34 out_dir = args["out_dir"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
35 if not os.path.isdir(args["out_dir"]):
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
36 raise FileNotFoundError('out_dir is not a directory')
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
37
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
38 top = args["top"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
39 if top < 0:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
40 raise ValueError("Number of header lines cannot be negative")
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
41
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
42 ftype = args["ftype"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
43
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
44 assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input"
2
d150ac3d853d "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 06ffe450bafa280eee8a4331c9cfc9e1ece7c522"
bgruening
parents: 0
diff changeset
45
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
46 if args["ftype"] == "tabular" and args["by"] == "col":
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
47 args["match"] = replace_mapped_chars(args["match"])
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
48 args["sub"] = replace_mapped_chars(args["sub"])
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
49 split_by_column(args, in_file, out_dir, top)
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
50 else:
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
51 args["generic_re"] = replace_mapped_chars(args["generic_re"])
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
52 split_by_record(args, in_file, out_dir, top, ftype)
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
53
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
54
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
55 def parser_cli():
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
56 parser = argparse.ArgumentParser(description="split a file into multiple files. " +
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
57 "Can split on the column of a tabular file, " +
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
58 "with custom and useful names based on column value.")
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
59 parser.add_argument('--in', '-i', required=True, help="The input file")
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True)
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
63 " the extension of the new files (without a period)")
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True,
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
67 default="row", choices=["col", "row"])
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
71 "If not provided and args[\"rand\"]==True, then date is used", type=int)
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
72 group = parser.add_mutually_exclusive_group()
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
73 group.add_argument('--numnew', '-n', type=int, default=1,
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
75 group.add_argument('--chunksize', '-k', type=int, default=0,
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
76 help="Number of records by file. Not valid for splitting on a column")
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
77 parser.add_argument('--batch', action='store_true',
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
79 generic = parser.add_argument_group('Arguments controling generic splitting')
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
80 group = generic.add_mutually_exclusive_group()
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False)
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False)
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
83 generic.add_argument('--split_after', '-p', action='store_true',
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
84 help="Split between records after separator (default is before). " +
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
85 "Only for generic splitting by regex - specific ftypes are always split in the default way")
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
86 bycol = parser.add_argument_group('If splitting on a column')
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
87 bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries")
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
88 bycol.add_argument('--sub', '-s', default=r'\1',
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
89 help="The regular expression to substitute in for the matched pattern.")
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
90 bycol.add_argument('--id_column', '-c', default="1",
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
91 help="Column that is used to name output files. Indexed starting from 1.", type=int)
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
92 return parser
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
93
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
94
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
95 def replace_mapped_chars(pattern):
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
96 """
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
97 handles special escaped characters when coming from galaxy
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
98 """
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
99 mapped_chars = {'\'': '__sq__', '\\': '__backslash__'}
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
100 for key, value in mapped_chars.items():
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
101 pattern = pattern.replace(value, key)
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
102 return pattern
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
103
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
104
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
105 def split_by_record(args, in_file, out_dir, top, ftype):
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
106 # get configuration (record separator, start at end) for given filetype
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
107 sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"]))
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
108 sep = re.compile(sep)
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
109
4
0850f2dfba13 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents: 3
diff changeset
110 chunksize = args["chunksize"]
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
111 numnew = args["numnew"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
112
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
113 # random division
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
114 rand = args["rand"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
115 seed = args["seed"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
116 if seed:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
117 random.seed(seed)
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
118 else:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
119 random.seed()
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
120
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
121 # batched division (maintains order)
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
122 batch = args["batch"]
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
123
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
124 # determine
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
125 # - the number of records that should be stored per file
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
126 # (done always, even if used only for batch mode)
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
127 # - if the separator is a the start / end of the record
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
128 n_per_file = math.inf
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
129 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
130 with open(in_file) as f:
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
131 # read header lines
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
132 for i in range(top):
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
133 f.readline()
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
134 n_records = 0
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
135 for line in f:
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
136 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
137 n_records += 1
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
138 last_line_matched = True
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
139 else:
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
140 last_line_matched = False
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
141 if sep_at_end and not last_line_matched:
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
142 n_records += 1
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
143
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
144 # if there are fewer records than desired files
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
145 numnew = min(numnew, n_records)
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
146 # approx. number of records per file
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
147 if chunksize == 0: # i.e. no chunking
4
0850f2dfba13 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents: 3
diff changeset
148 n_per_file = n_records // numnew
0850f2dfba13 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents: 3
diff changeset
149 else:
0850f2dfba13 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents: 3
diff changeset
150 numnew = n_records // chunksize
0850f2dfba13 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents: 3
diff changeset
151 n_per_file = chunksize
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
152
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
153 # make new files
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
154 # strip extension of old file and add number
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
155 custom_new_file_name = args["file_names"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
156 custom_new_file_ext = "." + args["file_ext"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
157 if custom_new_file_name is None:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
158 new_file_base = os.path.splitext(os.path.basename(in_file))
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
159 else:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
160 new_file_base = [custom_new_file_name, custom_new_file_ext]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
161
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
162 newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)]
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
163 # bunch o' counters
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
164 # index to list of new files
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
165 if rand:
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
166 new_file_counter = int(math.floor(random.random() * numnew))
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
167 else:
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
168 new_file_counter = 0
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
169 new_file = open(newfile_names[new_file_counter], "a")
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
170 # to contain header specified by top
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
171 header = ""
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
172 # keep track of the files that have been opened so far
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
173 fresh_files = set(range(numnew))
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
174
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
175 # keep track in loop of number of records in each file
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
176 # only used in batch
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
177 records_in_file = 0
3
2ddc36385d7a "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 8d069684e155d2f5b6fae06d14d98ce41321da53"
bgruening
parents: 2
diff changeset
178
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
179 # open file
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
180 with open(in_file, "r") as f:
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
181 # read header
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
182 for i in range(top):
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
183 header += f.readline()
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
184
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
185 record = ""
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
186 for line_no, line in enumerate(f):
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
187 # check if beginning of line is record sep
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
188 # if beginning of line is record sep, either start record or finish one
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
189 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
190 # this only happens first time through
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
191 if record == "":
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
192 record += line
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
193 else:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
194 # if is in fresh_files, write header and drop from freshFiles
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
195 if new_file_counter in fresh_files:
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
196 new_file.write(header)
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
197 fresh_files.remove(new_file_counter)
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
198
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
199 if sep_at_end:
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
200 record += line
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
201 # write record to file
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
202 new_file.write(record)
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
203 if not sep_at_end:
4
0850f2dfba13 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents: 3
diff changeset
204 record = line
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
205 else:
4
0850f2dfba13 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents: 3
diff changeset
206 record = ""
5
e77b954f0da5 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents: 4
diff changeset
207
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
208 # change destination file
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
209 if rand:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
210 new_file_counter = int(math.floor(random.random() * numnew))
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
211 new_file.close()
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
212 new_file = open(newfile_names[new_file_counter], "a")
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
213 elif batch:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
214 # number of records read per file
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
215 records_in_file += 1
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
216 # have we reached the max for each file?
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
217 # if so, switch file
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
218 if records_in_file >= n_per_file:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
219 new_file_counter = (new_file_counter + 1) % numnew
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
220 records_in_file = 0 # reset to 0
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
221 new_file.close()
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
222 new_file = open(newfile_names[new_file_counter], "a")
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
223 else:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
224 new_file_counter = (new_file_counter + 1) % numnew
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
225 new_file.close()
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
226 new_file = open(newfile_names[new_file_counter], "a")
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
227 # if beginning of line is not record sep, we must be inside a record
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
228 # so just append
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
229 else:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
230 record += line
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
231 # after loop, write final record to file
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
232 new_file.write(record)
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
233 new_file.close()
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
234
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
235
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
236 def split_by_column(args, in_file, out_dir, top):
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
237
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
238 # shift to 0-based indexing
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
239 id_col = int(args["id_column"]) - 1
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
240
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
241 try:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
242 match = re.compile(args["match"])
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
243 except re.error:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
244 print("ERROR: Match (-m) supplied is not valid regex.")
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
245 raise
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
246
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
247 sub = args["sub"]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
248
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
249 # set of file names
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
250 files = set()
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
251
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
252 # keep track of how many lines have been read
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
253 n_read = 0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
254 header = ""
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
255 with open(in_file) as file:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
256 for line in file:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
257 # if still in top, save to header
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
258 n_read += 1
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
259 if n_read <= top:
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
260 header += line
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
261 continue
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
262 # split into columns, on tab
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
263 fields = re.split(r'\t', line.strip('\n'))
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
264
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
265 # get id column value
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
266 id_col_val = fields[id_col]
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
267
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
268 # use regex to get new file name
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
269 out_file_name = re.sub(match, sub, id_col_val)
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
270 out_file_path = os.path.join(out_dir, out_file_name)
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
271
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
272 # write
8
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
273 with open(out_file_path, "a") as current_new_file:
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
274 if out_file_name not in files:
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
275 current_new_file.write(header)
6cbe2f30c2d7 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents: 7
diff changeset
276 files.add(out_file_name)
7
0046692724f9 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6f78214d2c9d7786bfc9d8cbddac7d2613cd314e"
bgruening
parents: 6
diff changeset
277 current_new_file.write(line)
0
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
278
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
279
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
280 if __name__ == "__main__":
de3c2c88e710 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff changeset
281 main()