comparison escape_excel.pl @ 1:7726adcf91c6 draft

Uploaded Escape Excel Perl script
author pstew
date Fri, 17 Feb 2017 16:38:12 -0500
parents
children
comparison
equal deleted inserted replaced
0:45f9b77eda87 1:7726adcf91c6
1 #!/usr/bin/perl -w
2
3 use Scalar::Util qw(looks_like_number);
4
5 $date_abbrev_hash{'jan'} = 'january';
6 $date_abbrev_hash{'feb'} = 'february';
7 $date_abbrev_hash{'mar'} = 'march';
8 $date_abbrev_hash{'apr'} = 'april';
9 $date_abbrev_hash{'may'} = 'may';
10 $date_abbrev_hash{'jun'} = 'jun';
11 $date_abbrev_hash{'jul'} = 'july';
12 $date_abbrev_hash{'aug'} = 'august';
13 $date_abbrev_hash{'sep'} = 'september';
14 $date_abbrev_hash{'oct'} = 'october';
15 $date_abbrev_hash{'nov'} = 'november';
16 $date_abbrev_hash{'dec'} = 'december';
17
18
19 sub is_number
20 {
21 # use what Perl thinks is a number first
22 if (looks_like_number($_[0]))
23 {
24 # Perl treats infinities as numbers, Excel does not
25 if ($_[0] =~ /^[+-]*inf/)
26 {
27 return 0;
28 }
29
30 return 1;
31 }
32
33 # Perl cannot handle American comma separators within long numbers.
34 # Excel does, so we have to check for it.
35 # Excel doesn't handle European dot separators, at least not when it is
36 # set to the US locale (my test environment). I am going to leave this
37 # unsupported for now.
38 #
39 return ($_[0] =~ /^([+-]?)[0-9]+(,\d\d\d)*([Ee]([+-]?[0-9]+))?$/);
40 }
41
42
43 sub has_text_month
44 {
45 my $date_str = $_[0];
46 my $abbrev;
47 my $full;
48 my $xor;
49 my $prefix_length;
50
51 $candidate = '';
52 if ($date_str =~ /^([0-9]{1,4}[- \/]*)?([A-Za-z]{3,9})/)
53 {
54 $candidate = lc $2;
55 }
56
57 if ($candidate eq '')
58 {
59 return 0;
60 }
61
62 $abbrev = substr $candidate, 0, 3;
63 $full = $date_abbrev_hash{$abbrev};
64
65 # first three letters are not the start of a month
66 if (!defined($full))
67 {
68 return 0;
69 }
70
71 # find common prefix
72 $xor = "$candidate" ^ "$full";
73 $xor =~ /^\0*/;
74 $prefix_length = $+[0];
75
76 # if the common prefix is the same as the full candidate, it is real
77 if (length $candidate eq $prefix_length)
78 {
79 return 1;
80 }
81
82 return 0;
83 }
84
85 $escape_excel_paranoid_flag = 0;
86 $escape_sci_flag = 1;
87 $escape_zeroes_flag = 1;
88 $escape_dates_flag = 1;
89
90 # read in command line arguments
91 $num_files = 0;
92 for ($i = 0; $i < @ARGV; $i++)
93 {
94 $field = $ARGV[$i];
95
96 if ($field =~ /^-/)
97 {
98 if ($field eq '--paranoid')
99 {
100 if ($escape_excel_paranoid_flag == 0)
101 {
102 $escape_excel_paranoid_flag = 1;
103 }
104 else
105 {
106 $escape_excel_paranoid_flag = 0;
107 }
108 }
109 elsif ($field eq '--no-sci')
110 {
111 $escape_sci_flag = 0;
112 }
113 elsif ($field eq '--no-zeroes')
114 {
115 $escape_zeroes_flag = 0;
116 }
117 elsif ($field eq '--no-dates')
118 {
119 $escape_dates_flag = 0;
120 }
121 else
122 {
123 printf "ABORT -- unknown option %s\n", $field;
124 $syntax_error_flag = 1;
125 }
126 }
127 else
128 {
129 if ($num_files == 1)
130 {
131 $outname = $field;
132 $num_files++;
133 }
134 if ($num_files == 0)
135 {
136 $filename = $field;
137 $num_files++;
138 }
139 }
140 }
141
142 # default to stdin if no filename given
143 if ($num_files == 0)
144 {
145 $filename = '-';
146 $num_files = 1;
147 }
148
149
150 # print syntax error message
151 if ($num_files == 0 || $syntax_error_flag)
152 {
153 printf STDERR "Syntax: escape_excel.pl [options] tab_delimited_input.txt [output.txt]\n";
154 printf STDERR " Options:\n";
155 printf STDERR " --no-dates Do not escape text that looks like dates\n";
156 printf STDERR " --no-sci Do not escape > #E (ex: 12E4) or >11 digit integer parts\n";
157 printf STDERR " --no-zeroes Do not escape leading zeroes (ie. 012345)\n";
158 printf STDERR " --paranoid Escape *ALL* non-numeric text\n";
159 printf STDERR " WARNING -- Excel can take a LONG time to import\n";
160 printf STDERR " text files where most fields are escaped.\n";
161 printf STDERR " Copy / Paste Values can become near unusuable....\n";
162 printf STDERR "\n";
163 printf STDERR " Input file must be tab-delimited.\n";
164 printf STDERR " Fields will be stripped of existing =\"\" escapes, enclosing \"\", leading \",\n";
165 printf STDERR " and leading/trailing spaces, as they may all cause problems.\n";
166 printf STDERR "\n";
167 printf STDERR " Defaults to escaping most Excel mis-imported fields.\n";
168 printf STDERR " Escapes a few extra date-like formats that Excel does not consider dates.\n";
169 printf STDERR " Please send unhandled mis-imported field examples (other than gene symbols\n";
170 printf STDERR " with 1-digit scientific notation, such as 2e4) to Eric.Welsh\@moffitt.org.\n";
171 printf STDERR "\n";
172 printf STDERR " Copy / Paste Values in Excel, after importing, to de-escape back into text.\n";
173 exit(1);
174 }
175
176
177 # output to STDOUT
178 if ($num_files == 1)
179 {
180 *OUTFILE = STDOUT;
181 }
182 # output to specified file name
183 if ($num_files == 2)
184 {
185 open OUTFILE, ">$outname" or die "can't open output $outname\n";
186 }
187
188
189 # read in, escape, and print escaped lines
190 open INFILE, "$filename" or die "can't open $filename\n";
191 while(defined($line=<INFILE>))
192 {
193 # strip newline characters
194 $line =~ s/[\r\n]+//g;
195
196 @array = split /\t/, $line;
197
198 # Strip any leading UTF-8 byte order mark so it won't corrupt the
199 # first field, since regular Perl I/O is not byte order mark aware.
200 #
201 # https://en.wikipedia.org/wiki/Byte_order_mark
202 #
203 # Various Microsoft products can emit these and screw things up....
204 #
205 for ($i = 0; $i < @array; $i++)
206 {
207 $line =~ s/^//;
208 }
209
210 for ($i = 0; $i < @array; $i++)
211 {
212 # continue stripping problematic stuff until all has been stripped
213 do
214 {
215 $changed_flag = 0;
216
217 # remove pre-existing escapes or start/end double quotes,
218 # since either messes up ="" escapes
219 while ($array[$i] =~ s/^\=*\"(.*?)\"$/$1/)
220 {
221 $changed_flag = 1;
222 }
223
224 # remove leading ", since they mess up Excel in general
225 #
226 # this must be done after "", but before leading/trailing spaces,
227 # since removing leading/trailing spaces could result in more
228 # full "" enclosures, which would then be messed up by removing
229 # only the leading "
230 #
231 while ($array[$i] =~ s/^\"//)
232 {
233 $changed_flag = 1;
234 }
235
236 # remove leading spaces, since they won't protect long numbers
237 if ($array[$i] =~ s/^\s+//)
238 {
239 $changed_flag = 1;
240 }
241
242 # remove trailing spaces, since they won't protect dates
243 if ($array[$i] =~ s/\s+$//)
244 {
245 $changed_flag = 1;
246 }
247 } while ($changed_flag)
248 }
249
250 # escape fields
251 for ($i = 0; $i < @array; $i++)
252 {
253 # Strange but true -- 'text doesn't escape text properly in Excel
254 # when you try to use it in a text file to import. It will not
255 # auto-strip the leading ' like it does when you type it in a live
256 # spreadsheet. "text" doesn't, either. Oddly, ="text" DOES work,
257 # but an equation containing just a text string and no actual
258 # equation doesn't make much sense. However, it works, so that's
259 # what I use here to escape fields into mangle-protected text.
260
261 # escape numeric problems
262 if (is_number($array[$i]))
263 {
264 # keep leading zeroes for >1 digit before the decimal point
265 if ($escape_zeroes_flag && $array[$i] =~ /^([+-]?)0[0-9]/)
266 {
267 $array[$i] = sprintf "=\"%s\"", $array[$i];
268 }
269
270 # Escape scientific notation with >= 2 digits before the E,
271 # since they are likely accessions or plate/well identifiers.
272 #
273 # Also escape numbers with >11 digits before the decimal point.
274 # >11 is when it displays scientific notation in General format,
275 # which can result in corruption when saved to text.
276 # >15 would be the limit at which it loses precision internally.
277 #
278 # NOTE -- if there is a + or - at the beginning, this rule
279 # will not trigger. Undecided if this is desired or not.
280 # Probably desired behavior, since +/- would indicate that
281 # it is probably a true number, and not an accession or
282 # plate/well identifier.
283 #
284 elsif ($escape_sci_flag)
285 {
286 # strip commas before counting digits
287 $temp = $array[$i];
288 $temp =~ s/\,//g;
289
290 if ($temp =~ /^([1-9][0-9]{11,}|[0-9]{2,}[eE])/)
291 {
292 $array[$i] = sprintf "=\"%s\"", $array[$i];
293 }
294 }
295 }
296 # escape all text if paranoid
297 elsif ($escape_excel_paranoid_flag)
298 {
299 $array[$i] = sprintf "=\"%s\"", $array[$i];
300 }
301 # escape dates
302 elsif ($escape_dates_flag)
303 {
304 # escape single quote at beginning of line
305 if ($array[$i] =~ /^'/)
306 {
307 $array[$i] = sprintf "=\"%s\"", $array[$i];
308 }
309
310 # prevent conversion into formulas
311 elsif ($array[$i] =~ /^\=/)
312 {
313 $array[$i] = sprintf "=\"%s\"", $array[$i];
314 }
315 # Excel is smart enough to treat all +/- as not an equation
316 # but, otherwise, it will convert anything starting with +/-
317 # into "#NAME?" as a failed invalid equation
318 elsif ($array[$i] =~ /^[+-]/ && !($array[$i] =~ /^[+-]+$/))
319 {
320 $array[$i] = sprintf "=\"%s\"", $array[$i];
321 }
322
323 # check for time and/or date stuff
324 else
325 {
326 $time = '';
327 $date = '';
328
329 # attempt to guess at how excel might autoconvert into time
330 # allow letter/punctuation at end if it could be part of a date
331 # it would get too complicated to handle date-ness correctly,
332 # since I'm already resorting to negative look-ahead
333 if ($array[$i] =~ /\b(([0-9]+\s+(AM|PM|A|P)|[0-9]+:[0-9]+(:[0-9.]+)?)(\s+(AM|PM|A|P))?)(?!([^-\/, 0-9ADFJMNOSadfjmnos]))/)
334 {
335 $time = $1;
336 }
337
338 $strip_time = $array[$i];
339 if ($time =~ /\S/)
340 {
341 $strip_time =~ s/\Q$time\E//;
342 $strip_time =~ s/^\s+//;
343 $strip_time =~ s/\s+$//
344 }
345
346 # text date, month in the middle
347 if ($strip_time =~ /\b([0-9]{1,4}[- \/]*Jan[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
348 $strip_time =~ /\b([0-9]{1,4}[- \/]*Feb[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
349 $strip_time =~ /\b([0-9]{1,4}[- \/]*Mar[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
350 $strip_time =~ /\b([0-9]{1,4}[- \/]*Apr[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
351 $strip_time =~ /\b([0-9]{1,4}[- \/]*May[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
352 $strip_time =~ /\b([0-9]{1,4}[- \/]*Jun[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
353 $strip_time =~ /\b([0-9]{1,4}[- \/]*Jul[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
354 $strip_time =~ /\b([0-9]{1,4}[- \/]*Aug[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
355 $strip_time =~ /\b([0-9]{1,4}[- \/]*Sep[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
356 $strip_time =~ /\b([0-9]{1,4}[- \/]*Oct[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
357 $strip_time =~ /\b([0-9]{1,4}[- \/]*Nov[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
358 $strip_time =~ /\b([0-9]{1,4}[- \/]*Dec[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i)
359 {
360 $temp = $1;
361
362 if (has_text_month($temp))
363 {
364 $date = $temp;
365 }
366 }
367
368 # text date, month first
369 elsif ($strip_time =~ /\b(Jan[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
370 $strip_time =~ /\b(Feb[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
371 $strip_time =~ /\b(Mar[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
372 $strip_time =~ /\b(Apr[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
373 $strip_time =~ /\b(May[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
374 $strip_time =~ /\b(Jun[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
375 $strip_time =~ /\b(Jul[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
376 $strip_time =~ /\b(Aug[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
377 $strip_time =~ /\b(Sep[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
378 $strip_time =~ /\b(Oct[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
379 $strip_time =~ /\b(Nov[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
380 $strip_time =~ /\b(Dec[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i)
381 {
382 $temp = $1;
383
384 if (has_text_month($temp))
385 {
386 $date = $temp;
387 }
388 }
389
390 # possibly a numeric date
391 elsif ($strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,2})\b/ ||
392 $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4}[- \/]+[0-9]{1,2})\b/ ||
393 $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,4})\b/ ||
394 $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4})\b/ ||
395 $strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2})\b/)
396 {
397 $date = $1;
398 }
399
400 # be sure that date and time anchor the ends
401 # mix of time and date
402 if ($time =~ /\S/ && $date =~ /\S/)
403 {
404 if ($array[$i] =~ /^\Q$time\E(.*)\Q$date\E$/ ||
405 $array[$i] =~ /^\Q$date\E(.*)\Q$time\E$/)
406 {
407 $middle = $1;
408
409 # allow blank
410 # allow for purely whitespace
411 # allow for a single hyphen, slash, comma
412 # allow for multiple spaces before and/or after
413 if ($middle eq '' ||
414 $middle =~ /^\s+$/ ||
415 $middle =~ /^\s*[-\/,]\s*$/)
416 {
417 $array[$i] = sprintf "=\"%s\"", $array[$i];
418 }
419 }
420 }
421 # only time
422 elsif ($time =~ /\S/)
423 {
424 if ($array[$i] =~ /^\Q$time\E$/)
425 {
426 $array[$i] = sprintf "=\"%s\"", $array[$i];
427 }
428 }
429 # only date
430 elsif ($date =~ /\S/)
431 {
432 if ($array[$i] =~ /^\Q$date\E$/)
433 {
434 $array[$i] = sprintf "=\"%s\"", $array[$i];
435 }
436 }
437 }
438 }
439 }
440
441 # make the new escaped line
442 $line_escaped = join "\t", @array;
443
444 # print it
445 print OUTFILE "$line_escaped\n";
446 }
447 close INFILE;
448
449 close OUTFILE;