comparison split_file_to_collection.py @ 4:0850f2dfba13 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
author bgruening
date Wed, 09 Oct 2019 07:34:49 -0400
parents 2ddc36385d7a
children e77b954f0da5
comparison
equal deleted inserted replaced
3:2ddc36385d7a 4:0850f2dfba13
14 """ 14 """
15 FILETYPES = {'fasta': '^>', 15 FILETYPES = {'fasta': '^>',
16 'fastq': '^@', 16 'fastq': '^@',
17 'tabular': '^.*', 17 'tabular': '^.*',
18 'txt': '^.*', 18 'txt': '^.*',
19 'mgf': '^BEGIN IONS'} 19 'mgf': '^BEGIN IONS',
20 'sdf': '\$\$\$\$',
21 }
20 22
21 23
22 def main(): 24 def main():
23 ps = parser_cli() 25 ps = parser_cli()
24 args = vars(ps.parse_args()) 26 args = vars(ps.parse_args())
57 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) 59 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True)
58 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") 60 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
59 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + 61 parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
60 " the extension of the new files (without a period)") 62 " the extension of the new files (without a period)")
61 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, 63 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
62 choices=["mgf", "fastq", "fasta", "tabular", "txt", "generic"]) 64 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
63 parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False) 65 parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
64 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
65 default = "row", choices = ["col", "row"]) 67 default = "row", choices = ["col", "row"])
66 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " + 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " +
67 "(tabular only).") 69 "(tabular only).")
68 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') 70 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
69 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + 71 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
70 "If not provided and args[\"rand\"]==True, then date is used", type=int) 72 "If not provided and args[\"rand\"]==True, then date is used", type=int)
71 parser.add_argument('--numnew', '-n', type=int, default = 1, 73 parser.add_argument('--numnew', '-n', type=int, default = 1,
72 help="Number of output files desired. Not valid for splitting on a column") 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
75 parser.add_argument('--chunksize', '-k', type=int, default = 0,
76 help="Number of records by file. Not valid for splitting on a column")
73 parser.add_argument('--batch', action='store_true', 77 parser.add_argument('--batch', action='store_true',
74 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
75 79 parser.add_argument('--split_after', '-p', action='store_true',
80 help="Split between records after separator (default is before)." +
81 "Only for generic - specific ftypes are always split in the default way")
76 bycol = parser.add_argument_group('If splitting on a column') 82 bycol = parser.add_argument_group('If splitting on a column')
77 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") 83 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
78 bycol.add_argument('--sub', '-s', default = r'\1', 84 bycol.add_argument('--sub', '-s', default = r'\1',
79 help="The regular expression to substitute in for the matched pattern.") 85 help="The regular expression to substitute in for the matched pattern.")
80 bycol.add_argument('--id_column', '-c', default="1", 86 bycol.add_argument('--id_column', '-c', default="1",
100 106
101 def split_by_record(args, in_file, out_dir, top, ftype): 107 def split_by_record(args, in_file, out_dir, top, ftype):
102 # get record separator for given filetype 108 # get record separator for given filetype
103 sep = re.compile(FILETYPES.get(ftype, args["generic_re"])) 109 sep = re.compile(FILETYPES.get(ftype, args["generic_re"]))
104 110
111 chunksize = args["chunksize"]
105 numnew = args["numnew"] 112 numnew = args["numnew"]
106 113
107 # random division 114 # random division
108 rand = args["rand"] 115 rand = args["rand"]
109 seed = args["seed"] 116 seed = args["seed"]
112 else: 119 else:
113 random.seed() 120 random.seed()
114 121
115 # batched division (maintains order) 122 # batched division (maintains order)
116 batch = args["batch"] 123 batch = args["batch"]
117 # define n_per_file so we don't get a warning about ref before assignment 124
118 n_per_file = math.inf 125
119 if batch: 126 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
127 # define n_per_file so we don't get a warning about ref before assignment
128 n_per_file = math.inf
129
120 # number of records 130 # number of records
121 with open(in_file) as f: 131 with open(in_file) as f:
122 i = 0 132 i = 0
123 for line in f: 133 for line in f:
124 if re.match(sep, line) is not None: 134 if re.match(sep, line) is not None:
125 i+=1 135 i+=1
126 n_records = i + 1 136 n_records = i + 1
127 if top: 137 if top:
128 n_records -= top # don't count the top lines 138 n_records -= top # don't count the top lines
129 139
130 # approx. number of lines per file 140 if chunksize == 0: # i.e. no chunking
131 n_per_file = n_records // numnew 141 # approx. number of lines per file
142 n_per_file = n_records // numnew
143 else:
144 # chunksize fixes the number of records per file; derive the number of files from it
145 numnew = n_records // chunksize
146 n_per_file = chunksize
147
148
149
132 150
133 # make new files 151 # make new files
134 # strip extension of old file and add number 152 # strip extension of old file and add number
135 custom_new_file_name = args["file_names"] 153 custom_new_file_name = args["file_names"]
136 custom_new_file_ext = "." + args["file_ext"] 154 custom_new_file_ext = "." + args["file_ext"]
177 else: 195 else:
178 # if is in fresh_files, write header and drop from freshFiles 196 # if is in fresh_files, write header and drop from freshFiles
179 if new_file_counter in fresh_files: 197 if new_file_counter in fresh_files:
180 newfiles[new_file_counter].write(header) 198 newfiles[new_file_counter].write(header)
181 fresh_files.remove(new_file_counter) 199 fresh_files.remove(new_file_counter)
182 200
183 # write record to file 201 if ftype != "sdf" and args["split_after"] == False:
184 newfiles[new_file_counter].write(record) 202 # write record to file
185 203 newfiles[new_file_counter].write(record)
186 # if not the first time through, we assign the new record 204
187 record = line 205 # if not the first time through, we assign the new record
188 206 record = line
207
208 else: # for sdf or --split_after, the separator line ends the record, so append it before writing
209 record += line
210 newfiles[new_file_counter].write(record)
211 record = ""
212
189 # change destination file 213 # change destination file
190 if rand: 214 if rand:
191 new_file_counter = int(math.floor(random.random() * numnew)) 215 new_file_counter = int(math.floor(random.random() * numnew))
192 elif batch: 216 elif batch:
193 # number of records read per file 217 # number of records read per file