1
|
1 #!/usr/bin/perl -w
|
|
2
|
|
3 use Scalar::Util qw(looks_like_number);
|
|
4
|
|
5 $date_abbrev_hash{'jan'} = 'january';
|
|
6 $date_abbrev_hash{'feb'} = 'february';
|
|
7 $date_abbrev_hash{'mar'} = 'march';
|
|
8 $date_abbrev_hash{'apr'} = 'april';
|
|
9 $date_abbrev_hash{'may'} = 'may';
|
|
10 $date_abbrev_hash{'jun'} = 'jun';
|
|
11 $date_abbrev_hash{'jul'} = 'july';
|
|
12 $date_abbrev_hash{'aug'} = 'august';
|
|
13 $date_abbrev_hash{'sep'} = 'september';
|
|
14 $date_abbrev_hash{'oct'} = 'october';
|
|
15 $date_abbrev_hash{'nov'} = 'november';
|
|
16 $date_abbrev_hash{'dec'} = 'december';
|
|
17
|
|
18
|
|
19 sub is_number
|
|
20 {
|
|
21 # use what Perl thinks is a number first
|
|
22 if (looks_like_number($_[0]))
|
|
23 {
|
|
24 # Perl treats infinities as numbers, Excel does not
|
|
25 if ($_[0] =~ /^[+-]*inf/)
|
|
26 {
|
|
27 return 0;
|
|
28 }
|
|
29
|
|
30 return 1;
|
|
31 }
|
|
32
|
|
33 # Perl cannot handle American comma separators within long numbers.
|
|
34 # Excel does, so we have to check for it.
|
|
35 # Excel doesn't handle European dot separators, at least not when it is
|
|
36 # set to the US locale (my test environment). I am going to leave this
|
|
37 # unsupported for now.
|
|
38 #
|
|
39 return ($_[0] =~ /^([+-]?)[0-9]+(,\d\d\d)*([Ee]([+-]?[0-9]+))?$/);
|
|
40 }
|
|
41
|
|
42
|
|
43 sub has_text_month
|
|
44 {
|
|
45 my $date_str = $_[0];
|
|
46 my $abbrev;
|
|
47 my $full;
|
|
48 my $xor;
|
|
49 my $prefix_length;
|
|
50
|
|
51 $candidate = '';
|
|
52 if ($date_str =~ /^([0-9]{1,4}[- \/]*)?([A-Za-z]{3,9})/)
|
|
53 {
|
|
54 $candidate = lc $2;
|
|
55 }
|
|
56
|
|
57 if ($candidate eq '')
|
|
58 {
|
|
59 return 0;
|
|
60 }
|
|
61
|
|
62 $abbrev = substr $candidate, 0, 3;
|
|
63 $full = $date_abbrev_hash{$abbrev};
|
|
64
|
|
65 # first three letters are not the start of a month
|
|
66 if (!defined($full))
|
|
67 {
|
|
68 return 0;
|
|
69 }
|
|
70
|
|
71 # find common prefix
|
|
72 $xor = "$candidate" ^ "$full";
|
|
73 $xor =~ /^\0*/;
|
|
74 $prefix_length = $+[0];
|
|
75
|
|
76 # if the common prefix is the same as the full candidate, it is real
|
|
77 if (length $candidate eq $prefix_length)
|
|
78 {
|
|
79 return 1;
|
|
80 }
|
|
81
|
|
82 return 0;
|
|
83 }
|
|
84
|
|
85 $escape_excel_paranoid_flag = 0;
|
|
86 $escape_sci_flag = 1;
|
|
87 $escape_zeroes_flag = 1;
|
|
88 $escape_dates_flag = 1;
|
|
89
|
|
90 # read in command line arguments
|
|
91 $num_files = 0;
|
|
92 for ($i = 0; $i < @ARGV; $i++)
|
|
93 {
|
|
94 $field = $ARGV[$i];
|
|
95
|
|
96 if ($field =~ /^-/)
|
|
97 {
|
|
98 if ($field eq '--paranoid')
|
|
99 {
|
|
100 if ($escape_excel_paranoid_flag == 0)
|
|
101 {
|
|
102 $escape_excel_paranoid_flag = 1;
|
|
103 }
|
|
104 else
|
|
105 {
|
|
106 $escape_excel_paranoid_flag = 0;
|
|
107 }
|
|
108 }
|
|
109 elsif ($field eq '--no-sci')
|
|
110 {
|
|
111 $escape_sci_flag = 0;
|
|
112 }
|
|
113 elsif ($field eq '--no-zeroes')
|
|
114 {
|
|
115 $escape_zeroes_flag = 0;
|
|
116 }
|
|
117 elsif ($field eq '--no-dates')
|
|
118 {
|
|
119 $escape_dates_flag = 0;
|
|
120 }
|
|
121 else
|
|
122 {
|
|
123 printf "ABORT -- unknown option %s\n", $field;
|
|
124 $syntax_error_flag = 1;
|
|
125 }
|
|
126 }
|
|
127 else
|
|
128 {
|
|
129 if ($num_files == 1)
|
|
130 {
|
|
131 $outname = $field;
|
|
132 $num_files++;
|
|
133 }
|
|
134 if ($num_files == 0)
|
|
135 {
|
|
136 $filename = $field;
|
|
137 $num_files++;
|
|
138 }
|
|
139 }
|
|
140 }
|
|
141
|
|
142 # default to stdin if no filename given
|
|
143 if ($num_files == 0)
|
|
144 {
|
|
145 $filename = '-';
|
|
146 $num_files = 1;
|
|
147 }
|
|
148
|
|
149
|
|
150 # print syntax error message
|
|
151 if ($num_files == 0 || $syntax_error_flag)
|
|
152 {
|
|
153 printf STDERR "Syntax: escape_excel.pl [options] tab_delimited_input.txt [output.txt]\n";
|
|
154 printf STDERR " Options:\n";
|
|
155 printf STDERR " --no-dates Do not escape text that looks like dates\n";
|
|
156 printf STDERR " --no-sci Do not escape > #E (ex: 12E4) or >11 digit integer parts\n";
|
|
157 printf STDERR " --no-zeroes Do not escape leading zeroes (ie. 012345)\n";
|
|
158 printf STDERR " --paranoid Escape *ALL* non-numeric text\n";
|
|
159 printf STDERR " WARNING -- Excel can take a LONG time to import\n";
|
|
160 printf STDERR " text files where most fields are escaped.\n";
|
|
161 printf STDERR " Copy / Paste Values can become near unusuable....\n";
|
|
162 printf STDERR "\n";
|
|
163 printf STDERR " Input file must be tab-delimited.\n";
|
|
164 printf STDERR " Fields will be stripped of existing =\"\" escapes, enclosing \"\", leading \",\n";
|
|
165 printf STDERR " and leading/trailing spaces, as they may all cause problems.\n";
|
|
166 printf STDERR "\n";
|
|
167 printf STDERR " Defaults to escaping most Excel mis-imported fields.\n";
|
|
168 printf STDERR " Escapes a few extra date-like formats that Excel does not consider dates.\n";
|
|
169 printf STDERR " Please send unhandled mis-imported field examples (other than gene symbols\n";
|
|
170 printf STDERR " with 1-digit scientific notation, such as 2e4) to Eric.Welsh\@moffitt.org.\n";
|
|
171 printf STDERR "\n";
|
|
172 printf STDERR " Copy / Paste Values in Excel, after importing, to de-escape back into text.\n";
|
|
173 exit(1);
|
|
174 }
|
|
175
|
|
176
|
|
177 # output to STDOUT
|
|
178 if ($num_files == 1)
|
|
179 {
|
|
180 *OUTFILE = STDOUT;
|
|
181 }
|
|
182 # output to specified file name
|
|
183 if ($num_files == 2)
|
|
184 {
|
|
185 open OUTFILE, ">$outname" or die "can't open output $outname\n";
|
|
186 }
|
|
187
|
|
188
|
|
189 # read in, escape, and print escaped lines
|
|
190 open INFILE, "$filename" or die "can't open $filename\n";
|
|
191 while(defined($line=<INFILE>))
|
|
192 {
|
|
193 # strip newline characters
|
|
194 $line =~ s/[\r\n]+//g;
|
|
195
|
|
196 @array = split /\t/, $line;
|
|
197
|
|
198 # Strip any leading UTF-8 byte order mark so it won't corrupt the
|
|
199 # first field, since regular Perl I/O is not byte order mark aware.
|
|
200 #
|
|
201 # https://en.wikipedia.org/wiki/Byte_order_mark
|
|
202 #
|
|
203 # Various Microsoft products can emit these and screw things up....
|
|
204 #
|
|
205 for ($i = 0; $i < @array; $i++)
|
|
206 {
|
|
207 $line =~ s/^//;
|
|
208 }
|
|
209
|
|
210 for ($i = 0; $i < @array; $i++)
|
|
211 {
|
|
212 # continue stripping problematic stuff until all has been stripped
|
|
213 do
|
|
214 {
|
|
215 $changed_flag = 0;
|
|
216
|
|
217 # remove pre-existing escapes or start/end double quotes,
|
|
218 # since either messes up ="" escapes
|
|
219 while ($array[$i] =~ s/^\=*\"(.*?)\"$/$1/)
|
|
220 {
|
|
221 $changed_flag = 1;
|
|
222 }
|
|
223
|
|
224 # remove leading ", since they mess up Excel in general
|
|
225 #
|
|
226 # this must be done after "", but before leading/trailing spaces,
|
|
227 # since removing leading/trailing spaces could result in more
|
|
228 # full "" enclosures, which would then be messed up by removing
|
|
229 # only the leading "
|
|
230 #
|
|
231 while ($array[$i] =~ s/^\"//)
|
|
232 {
|
|
233 $changed_flag = 1;
|
|
234 }
|
|
235
|
|
236 # remove leading spaces, since they won't protect long numbers
|
|
237 if ($array[$i] =~ s/^\s+//)
|
|
238 {
|
|
239 $changed_flag = 1;
|
|
240 }
|
|
241
|
|
242 # remove trailing spaces, since they won't protect dates
|
|
243 if ($array[$i] =~ s/\s+$//)
|
|
244 {
|
|
245 $changed_flag = 1;
|
|
246 }
|
|
247 } while ($changed_flag)
|
|
248 }
|
|
249
|
|
250 # escape fields
|
|
251 for ($i = 0; $i < @array; $i++)
|
|
252 {
|
|
253 # Strange but true -- 'text doesn't escape text properly in Excel
|
|
254 # when you try to use it in a text file to import. It will not
|
|
255 # auto-strip the leading ' like it does when you type it in a live
|
|
256 # spreadsheet. "text" doesn't, either. Oddly, ="text" DOES work,
|
|
257 # but an equation containing just a text string and no actual
|
|
258 # equation doesn't make much sense. However, it works, so that's
|
|
259 # what I use here to escape fields into mangle-protected text.
|
|
260
|
|
261 # escape numeric problems
|
|
262 if (is_number($array[$i]))
|
|
263 {
|
|
264 # keep leading zeroes for >1 digit before the decimal point
|
|
265 if ($escape_zeroes_flag && $array[$i] =~ /^([+-]?)0[0-9]/)
|
|
266 {
|
|
267 $array[$i] = sprintf "=\"%s\"", $array[$i];
|
|
268 }
|
|
269
|
|
270 # Escape scientific notation with >= 2 digits before the E,
|
|
271 # since they are likely accessions or plate/well identifiers.
|
|
272 #
|
|
273 # Also escape numbers with >11 digits before the decimal point.
|
|
274 # >11 is when it displays scientific notation in General format,
|
|
275 # which can result in corruption when saved to text.
|
|
276 # >15 would be the limit at which it loses precision internally.
|
|
277 #
|
|
278 # NOTE -- if there is a + or - at the beginning, this rule
|
|
279 # will not trigger. Undecided if this is desired or not.
|
|
280 # Probably desired behavior, since +/- would indicate that
|
|
281 # it is probably a true number, and not an accession or
|
|
282 # plate/well identifier.
|
|
283 #
|
|
284 elsif ($escape_sci_flag)
|
|
285 {
|
|
286 # strip commas before counting digits
|
|
287 $temp = $array[$i];
|
|
288 $temp =~ s/\,//g;
|
|
289
|
|
290 if ($temp =~ /^([1-9][0-9]{11,}|[0-9]{2,}[eE])/)
|
|
291 {
|
|
292 $array[$i] = sprintf "=\"%s\"", $array[$i];
|
|
293 }
|
|
294 }
|
|
295 }
|
|
296 # escape all text if paranoid
|
|
297 elsif ($escape_excel_paranoid_flag)
|
|
298 {
|
|
299 $array[$i] = sprintf "=\"%s\"", $array[$i];
|
|
300 }
|
|
301 # escape dates
|
|
302 elsif ($escape_dates_flag)
|
|
303 {
|
|
304 # escape single quote at beginning of line
|
|
305 if ($array[$i] =~ /^'/)
|
|
306 {
|
|
307 $array[$i] = sprintf "=\"%s\"", $array[$i];
|
|
308 }
|
|
309
|
|
310 # prevent conversion into formulas
|
|
311 elsif ($array[$i] =~ /^\=/)
|
|
312 {
|
|
313 $array[$i] = sprintf "=\"%s\"", $array[$i];
|
|
314 }
|
|
315 # Excel is smart enough to treat all +/- as not an equation
|
|
316 # but, otherwise, it will convert anything starting with +/-
|
|
317 # into "#NAME?" as a failed invalid equation
|
|
318 elsif ($array[$i] =~ /^[+-]/ && !($array[$i] =~ /^[+-]+$/))
|
|
319 {
|
|
320 $array[$i] = sprintf "=\"%s\"", $array[$i];
|
|
321 }
|
|
322
|
|
323 # check for time and/or date stuff
|
|
324 else
|
|
325 {
|
|
326 $time = '';
|
|
327 $date = '';
|
|
328
|
|
329 # attempt to guess at how excel might autoconvert into time
|
|
330 # allow letter/punctuation at end if it could be part of a date
|
|
331 # it would get too complicated to handle date-ness correctly,
|
|
332 # since I'm already resorting to negative look-ahead
|
|
333 if ($array[$i] =~ /\b(([0-9]+\s+(AM|PM|A|P)|[0-9]+:[0-9]+(:[0-9.]+)?)(\s+(AM|PM|A|P))?)(?!([^-\/, 0-9ADFJMNOSadfjmnos]))/)
|
|
334 {
|
|
335 $time = $1;
|
|
336 }
|
|
337
|
|
338 $strip_time = $array[$i];
|
|
339 if ($time =~ /\S/)
|
|
340 {
|
|
341 $strip_time =~ s/\Q$time\E//;
|
|
342 $strip_time =~ s/^\s+//;
|
|
343 $strip_time =~ s/\s+$//
|
|
344 }
|
|
345
|
|
346 # text date, month in the middle
|
|
347 if ($strip_time =~ /\b([0-9]{1,4}[- \/]*Jan[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
348 $strip_time =~ /\b([0-9]{1,4}[- \/]*Feb[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
349 $strip_time =~ /\b([0-9]{1,4}[- \/]*Mar[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
350 $strip_time =~ /\b([0-9]{1,4}[- \/]*Apr[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
351 $strip_time =~ /\b([0-9]{1,4}[- \/]*May[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
352 $strip_time =~ /\b([0-9]{1,4}[- \/]*Jun[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
353 $strip_time =~ /\b([0-9]{1,4}[- \/]*Jul[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
354 $strip_time =~ /\b([0-9]{1,4}[- \/]*Aug[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
355 $strip_time =~ /\b([0-9]{1,4}[- \/]*Sep[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
356 $strip_time =~ /\b([0-9]{1,4}[- \/]*Oct[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
357 $strip_time =~ /\b([0-9]{1,4}[- \/]*Nov[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
|
|
358 $strip_time =~ /\b([0-9]{1,4}[- \/]*Dec[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i)
|
|
359 {
|
|
360 $temp = $1;
|
|
361
|
|
362 if (has_text_month($temp))
|
|
363 {
|
|
364 $date = $temp;
|
|
365 }
|
|
366 }
|
|
367
|
|
368 # text date, month first
|
|
369 elsif ($strip_time =~ /\b(Jan[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
370 $strip_time =~ /\b(Feb[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
371 $strip_time =~ /\b(Mar[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
372 $strip_time =~ /\b(Apr[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
373 $strip_time =~ /\b(May[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
374 $strip_time =~ /\b(Jun[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
375 $strip_time =~ /\b(Jul[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
376 $strip_time =~ /\b(Aug[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
377 $strip_time =~ /\b(Sep[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
378 $strip_time =~ /\b(Oct[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
379 $strip_time =~ /\b(Nov[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
|
|
380 $strip_time =~ /\b(Dec[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i)
|
|
381 {
|
|
382 $temp = $1;
|
|
383
|
|
384 if (has_text_month($temp))
|
|
385 {
|
|
386 $date = $temp;
|
|
387 }
|
|
388 }
|
|
389
|
|
390 # possibly a numeric date
|
|
391 elsif ($strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,2})\b/ ||
|
|
392 $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4}[- \/]+[0-9]{1,2})\b/ ||
|
|
393 $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,4})\b/ ||
|
|
394 $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4})\b/ ||
|
|
395 $strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2})\b/)
|
|
396 {
|
|
397 $date = $1;
|
|
398 }
|
|
399
|
|
400 # be sure that date and time anchor the ends
|
|
401 # mix of time and date
|
|
402 if ($time =~ /\S/ && $date =~ /\S/)
|
|
403 {
|
|
404 if ($array[$i] =~ /^\Q$time\E(.*)\Q$date\E$/ ||
|
|
405 $array[$i] =~ /^\Q$date\E(.*)\Q$time\E$/)
|
|
406 {
|
|
407 $middle = $1;
|
|
408
|
|
409 # allow blank
|
|
410 # allow for purely whitespace
|
|
411 # allow for a single hyphen, slash, comma
|
|
412 # allow for multiple spaces before and/or after
|
|
413 if ($middle eq '' ||
|
|
414 $middle =~ /^\s+$/ ||
|
|
415 $middle =~ /^\s*[-\/,]\s*$/)
|
|
416 {
|
|
417 $array[$i] = sprintf "=\"%s\"", $array[$i];
|
|
418 }
|
|
419 }
|
|
420 }
|
|
421 # only time
|
|
422 elsif ($time =~ /\S/)
|
|
423 {
|
|
424 if ($array[$i] =~ /^\Q$time\E$/)
|
|
425 {
|
|
426 $array[$i] = sprintf "=\"%s\"", $array[$i];
|
|
427 }
|
|
428 }
|
|
429 # only date
|
|
430 elsif ($date =~ /\S/)
|
|
431 {
|
|
432 if ($array[$i] =~ /^\Q$date\E$/)
|
|
433 {
|
|
434 $array[$i] = sprintf "=\"%s\"", $array[$i];
|
|
435 }
|
|
436 }
|
|
437 }
|
|
438 }
|
|
439 }
|
|
440
|
|
441 # make the new escaped line
|
|
442 $line_escaped = join "\t", @array;
|
|
443
|
|
444 # print it
|
|
445 print OUTFILE "$line_escaped\n";
|
|
446 }
|
|
447 close INFILE;
|
|
448
|
|
449 close OUTFILE;
|