Mercurial > repos > pstew > escape_excel
comparison escape_excel.pl @ 1:7726adcf91c6 draft
Uploaded Escape Excel Perl script
author | pstew |
---|---|
date | Fri, 17 Feb 2017 16:38:12 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:45f9b77eda87 | 1:7726adcf91c6 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 use Scalar::Util qw(looks_like_number); | |
4 | |
5 $date_abbrev_hash{'jan'} = 'january'; | |
6 $date_abbrev_hash{'feb'} = 'february'; | |
7 $date_abbrev_hash{'mar'} = 'march'; | |
8 $date_abbrev_hash{'apr'} = 'april'; | |
9 $date_abbrev_hash{'may'} = 'may'; | |
10 $date_abbrev_hash{'jun'} = 'jun'; | |
11 $date_abbrev_hash{'jul'} = 'july'; | |
12 $date_abbrev_hash{'aug'} = 'august'; | |
13 $date_abbrev_hash{'sep'} = 'september'; | |
14 $date_abbrev_hash{'oct'} = 'october'; | |
15 $date_abbrev_hash{'nov'} = 'november'; | |
16 $date_abbrev_hash{'dec'} = 'december'; | |
17 | |
18 | |
19 sub is_number | |
20 { | |
21 # use what Perl thinks is a number first | |
22 if (looks_like_number($_[0])) | |
23 { | |
24 # Perl treats infinities as numbers, Excel does not | |
25 if ($_[0] =~ /^[+-]*inf/) | |
26 { | |
27 return 0; | |
28 } | |
29 | |
30 return 1; | |
31 } | |
32 | |
33 # Perl cannot handle American comma separators within long numbers. | |
34 # Excel does, so we have to check for it. | |
35 # Excel doesn't handle European dot separators, at least not when it is | |
36 # set to the US locale (my test environment). I am going to leave this | |
37 # unsupported for now. | |
38 # | |
39 return ($_[0] =~ /^([+-]?)[0-9]+(,\d\d\d)*([Ee]([+-]?[0-9]+))?$/); | |
40 } | |
41 | |
42 | |
43 sub has_text_month | |
44 { | |
45 my $date_str = $_[0]; | |
46 my $abbrev; | |
47 my $full; | |
48 my $xor; | |
49 my $prefix_length; | |
50 | |
51 $candidate = ''; | |
52 if ($date_str =~ /^([0-9]{1,4}[- \/]*)?([A-Za-z]{3,9})/) | |
53 { | |
54 $candidate = lc $2; | |
55 } | |
56 | |
57 if ($candidate eq '') | |
58 { | |
59 return 0; | |
60 } | |
61 | |
62 $abbrev = substr $candidate, 0, 3; | |
63 $full = $date_abbrev_hash{$abbrev}; | |
64 | |
65 # first three letters are not the start of a month | |
66 if (!defined($full)) | |
67 { | |
68 return 0; | |
69 } | |
70 | |
71 # find common prefix | |
72 $xor = "$candidate" ^ "$full"; | |
73 $xor =~ /^\0*/; | |
74 $prefix_length = $+[0]; | |
75 | |
76 # if the common prefix is the same as the full candidate, it is real | |
77 if (length $candidate eq $prefix_length) | |
78 { | |
79 return 1; | |
80 } | |
81 | |
82 return 0; | |
83 } | |
84 | |
85 $escape_excel_paranoid_flag = 0; | |
86 $escape_sci_flag = 1; | |
87 $escape_zeroes_flag = 1; | |
88 $escape_dates_flag = 1; | |
89 | |
90 # read in command line arguments | |
91 $num_files = 0; | |
92 for ($i = 0; $i < @ARGV; $i++) | |
93 { | |
94 $field = $ARGV[$i]; | |
95 | |
96 if ($field =~ /^-/) | |
97 { | |
98 if ($field eq '--paranoid') | |
99 { | |
100 if ($escape_excel_paranoid_flag == 0) | |
101 { | |
102 $escape_excel_paranoid_flag = 1; | |
103 } | |
104 else | |
105 { | |
106 $escape_excel_paranoid_flag = 0; | |
107 } | |
108 } | |
109 elsif ($field eq '--no-sci') | |
110 { | |
111 $escape_sci_flag = 0; | |
112 } | |
113 elsif ($field eq '--no-zeroes') | |
114 { | |
115 $escape_zeroes_flag = 0; | |
116 } | |
117 elsif ($field eq '--no-dates') | |
118 { | |
119 $escape_dates_flag = 0; | |
120 } | |
121 else | |
122 { | |
123 printf "ABORT -- unknown option %s\n", $field; | |
124 $syntax_error_flag = 1; | |
125 } | |
126 } | |
127 else | |
128 { | |
129 if ($num_files == 1) | |
130 { | |
131 $outname = $field; | |
132 $num_files++; | |
133 } | |
134 if ($num_files == 0) | |
135 { | |
136 $filename = $field; | |
137 $num_files++; | |
138 } | |
139 } | |
140 } | |
141 | |
142 # default to stdin if no filename given | |
143 if ($num_files == 0) | |
144 { | |
145 $filename = '-'; | |
146 $num_files = 1; | |
147 } | |
148 | |
149 | |
150 # print syntax error message | |
151 if ($num_files == 0 || $syntax_error_flag) | |
152 { | |
153 printf STDERR "Syntax: escape_excel.pl [options] tab_delimited_input.txt [output.txt]\n"; | |
154 printf STDERR " Options:\n"; | |
155 printf STDERR " --no-dates Do not escape text that looks like dates\n"; | |
156 printf STDERR " --no-sci Do not escape > #E (ex: 12E4) or >11 digit integer parts\n"; | |
157 printf STDERR " --no-zeroes Do not escape leading zeroes (ie. 012345)\n"; | |
158 printf STDERR " --paranoid Escape *ALL* non-numeric text\n"; | |
159 printf STDERR " WARNING -- Excel can take a LONG time to import\n"; | |
160 printf STDERR " text files where most fields are escaped.\n"; | |
161 printf STDERR " Copy / Paste Values can become near unusuable....\n"; | |
162 printf STDERR "\n"; | |
163 printf STDERR " Input file must be tab-delimited.\n"; | |
164 printf STDERR " Fields will be stripped of existing =\"\" escapes, enclosing \"\", leading \",\n"; | |
165 printf STDERR " and leading/trailing spaces, as they may all cause problems.\n"; | |
166 printf STDERR "\n"; | |
167 printf STDERR " Defaults to escaping most Excel mis-imported fields.\n"; | |
168 printf STDERR " Escapes a few extra date-like formats that Excel does not consider dates.\n"; | |
169 printf STDERR " Please send unhandled mis-imported field examples (other than gene symbols\n"; | |
170 printf STDERR " with 1-digit scientific notation, such as 2e4) to Eric.Welsh\@moffitt.org.\n"; | |
171 printf STDERR "\n"; | |
172 printf STDERR " Copy / Paste Values in Excel, after importing, to de-escape back into text.\n"; | |
173 exit(1); | |
174 } | |
175 | |
176 | |
177 # output to STDOUT | |
178 if ($num_files == 1) | |
179 { | |
180 *OUTFILE = STDOUT; | |
181 } | |
182 # output to specified file name | |
183 if ($num_files == 2) | |
184 { | |
185 open OUTFILE, ">$outname" or die "can't open output $outname\n"; | |
186 } | |
187 | |
188 | |
189 # read in, escape, and print escaped lines | |
190 open INFILE, "$filename" or die "can't open $filename\n"; | |
191 while(defined($line=<INFILE>)) | |
192 { | |
193 # strip newline characters | |
194 $line =~ s/[\r\n]+//g; | |
195 | |
196 @array = split /\t/, $line; | |
197 | |
198 # Strip any leading UTF-8 byte order mark so it won't corrupt the | |
199 # first field, since regular Perl I/O is not byte order mark aware. | |
200 # | |
201 # https://en.wikipedia.org/wiki/Byte_order_mark | |
202 # | |
203 # Various Microsoft products can emit these and screw things up.... | |
204 # | |
205 for ($i = 0; $i < @array; $i++) | |
206 { | |
207 $line =~ s/^//; | |
208 } | |
209 | |
210 for ($i = 0; $i < @array; $i++) | |
211 { | |
212 # continue stripping problematic stuff until all has been stripped | |
213 do | |
214 { | |
215 $changed_flag = 0; | |
216 | |
217 # remove pre-existing escapes or start/end double quotes, | |
218 # since either messes up ="" escapes | |
219 while ($array[$i] =~ s/^\=*\"(.*?)\"$/$1/) | |
220 { | |
221 $changed_flag = 1; | |
222 } | |
223 | |
224 # remove leading ", since they mess up Excel in general | |
225 # | |
226 # this must be done after "", but before leading/trailing spaces, | |
227 # since removing leading/trailing spaces could result in more | |
228 # full "" enclosures, which would then be messed up by removing | |
229 # only the leading " | |
230 # | |
231 while ($array[$i] =~ s/^\"//) | |
232 { | |
233 $changed_flag = 1; | |
234 } | |
235 | |
236 # remove leading spaces, since they won't protect long numbers | |
237 if ($array[$i] =~ s/^\s+//) | |
238 { | |
239 $changed_flag = 1; | |
240 } | |
241 | |
242 # remove trailing spaces, since they won't protect dates | |
243 if ($array[$i] =~ s/\s+$//) | |
244 { | |
245 $changed_flag = 1; | |
246 } | |
247 } while ($changed_flag) | |
248 } | |
249 | |
250 # escape fields | |
251 for ($i = 0; $i < @array; $i++) | |
252 { | |
253 # Strange but true -- 'text doesn't escape text properly in Excel | |
254 # when you try to use it in a text file to import. It will not | |
255 # auto-strip the leading ' like it does when you type it in a live | |
256 # spreadsheet. "text" doesn't, either. Oddly, ="text" DOES work, | |
257 # but an equation containing just a text string and no actual | |
258 # equation doesn't make much sense. However, it works, so that's | |
259 # what I use here to escape fields into mangle-protected text. | |
260 | |
261 # escape numeric problems | |
262 if (is_number($array[$i])) | |
263 { | |
264 # keep leading zeroes for >1 digit before the decimal point | |
265 if ($escape_zeroes_flag && $array[$i] =~ /^([+-]?)0[0-9]/) | |
266 { | |
267 $array[$i] = sprintf "=\"%s\"", $array[$i]; | |
268 } | |
269 | |
270 # Escape scientific notation with >= 2 digits before the E, | |
271 # since they are likely accessions or plate/well identifiers. | |
272 # | |
273 # Also escape numbers with >11 digits before the decimal point. | |
274 # >11 is when it displays scientific notation in General format, | |
275 # which can result in corruption when saved to text. | |
276 # >15 would be the limit at which it loses precision internally. | |
277 # | |
278 # NOTE -- if there is a + or - at the beginning, this rule | |
279 # will not trigger. Undecided if this is desired or not. | |
280 # Probably desired behavior, since +/- would indicate that | |
281 # it is probably a true number, and not an accession or | |
282 # plate/well identifier. | |
283 # | |
284 elsif ($escape_sci_flag) | |
285 { | |
286 # strip commas before counting digits | |
287 $temp = $array[$i]; | |
288 $temp =~ s/\,//g; | |
289 | |
290 if ($temp =~ /^([1-9][0-9]{11,}|[0-9]{2,}[eE])/) | |
291 { | |
292 $array[$i] = sprintf "=\"%s\"", $array[$i]; | |
293 } | |
294 } | |
295 } | |
296 # escape all text if paranoid | |
297 elsif ($escape_excel_paranoid_flag) | |
298 { | |
299 $array[$i] = sprintf "=\"%s\"", $array[$i]; | |
300 } | |
301 # escape dates | |
302 elsif ($escape_dates_flag) | |
303 { | |
304 # escape single quote at beginning of line | |
305 if ($array[$i] =~ /^'/) | |
306 { | |
307 $array[$i] = sprintf "=\"%s\"", $array[$i]; | |
308 } | |
309 | |
310 # prevent conversion into formulas | |
311 elsif ($array[$i] =~ /^\=/) | |
312 { | |
313 $array[$i] = sprintf "=\"%s\"", $array[$i]; | |
314 } | |
315 # Excel is smart enough to treat all +/- as not an equation | |
316 # but, otherwise, it will convert anything starting with +/- | |
317 # into "#NAME?" as a failed invalid equation | |
318 elsif ($array[$i] =~ /^[+-]/ && !($array[$i] =~ /^[+-]+$/)) | |
319 { | |
320 $array[$i] = sprintf "=\"%s\"", $array[$i]; | |
321 } | |
322 | |
323 # check for time and/or date stuff | |
324 else | |
325 { | |
326 $time = ''; | |
327 $date = ''; | |
328 | |
329 # attempt to guess at how excel might autoconvert into time | |
330 # allow letter/punctuation at end if it could be part of a date | |
331 # it would get too complicated to handle date-ness correctly, | |
332 # since I'm already resorting to negative look-ahead | |
333 if ($array[$i] =~ /\b(([0-9]+\s+(AM|PM|A|P)|[0-9]+:[0-9]+(:[0-9.]+)?)(\s+(AM|PM|A|P))?)(?!([^-\/, 0-9ADFJMNOSadfjmnos]))/) | |
334 { | |
335 $time = $1; | |
336 } | |
337 | |
338 $strip_time = $array[$i]; | |
339 if ($time =~ /\S/) | |
340 { | |
341 $strip_time =~ s/\Q$time\E//; | |
342 $strip_time =~ s/^\s+//; | |
343 $strip_time =~ s/\s+$// | |
344 } | |
345 | |
346 # text date, month in the middle | |
347 if ($strip_time =~ /\b([0-9]{1,4}[- \/]*Jan[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
348 $strip_time =~ /\b([0-9]{1,4}[- \/]*Feb[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
349 $strip_time =~ /\b([0-9]{1,4}[- \/]*Mar[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
350 $strip_time =~ /\b([0-9]{1,4}[- \/]*Apr[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
351 $strip_time =~ /\b([0-9]{1,4}[- \/]*May[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
352 $strip_time =~ /\b([0-9]{1,4}[- \/]*Jun[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
353 $strip_time =~ /\b([0-9]{1,4}[- \/]*Jul[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
354 $strip_time =~ /\b([0-9]{1,4}[- \/]*Aug[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
355 $strip_time =~ /\b([0-9]{1,4}[- \/]*Sep[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
356 $strip_time =~ /\b([0-9]{1,4}[- \/]*Oct[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
357 $strip_time =~ /\b([0-9]{1,4}[- \/]*Nov[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || | |
358 $strip_time =~ /\b([0-9]{1,4}[- \/]*Dec[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i) | |
359 { | |
360 $temp = $1; | |
361 | |
362 if (has_text_month($temp)) | |
363 { | |
364 $date = $temp; | |
365 } | |
366 } | |
367 | |
368 # text date, month first | |
369 elsif ($strip_time =~ /\b(Jan[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
370 $strip_time =~ /\b(Feb[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
371 $strip_time =~ /\b(Mar[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
372 $strip_time =~ /\b(Apr[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
373 $strip_time =~ /\b(May[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
374 $strip_time =~ /\b(Jun[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
375 $strip_time =~ /\b(Jul[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
376 $strip_time =~ /\b(Aug[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
377 $strip_time =~ /\b(Sep[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
378 $strip_time =~ /\b(Oct[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
379 $strip_time =~ /\b(Nov[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || | |
380 $strip_time =~ /\b(Dec[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i) | |
381 { | |
382 $temp = $1; | |
383 | |
384 if (has_text_month($temp)) | |
385 { | |
386 $date = $temp; | |
387 } | |
388 } | |
389 | |
390 # possibly a numeric date | |
391 elsif ($strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,2})\b/ || | |
392 $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4}[- \/]+[0-9]{1,2})\b/ || | |
393 $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,4})\b/ || | |
394 $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4})\b/ || | |
395 $strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2})\b/) | |
396 { | |
397 $date = $1; | |
398 } | |
399 | |
400 # be sure that date and time anchor the ends | |
401 # mix of time and date | |
402 if ($time =~ /\S/ && $date =~ /\S/) | |
403 { | |
404 if ($array[$i] =~ /^\Q$time\E(.*)\Q$date\E$/ || | |
405 $array[$i] =~ /^\Q$date\E(.*)\Q$time\E$/) | |
406 { | |
407 $middle = $1; | |
408 | |
409 # allow blank | |
410 # allow for purely whitespace | |
411 # allow for a single hyphen, slash, comma | |
412 # allow for multiple spaces before and/or after | |
413 if ($middle eq '' || | |
414 $middle =~ /^\s+$/ || | |
415 $middle =~ /^\s*[-\/,]\s*$/) | |
416 { | |
417 $array[$i] = sprintf "=\"%s\"", $array[$i]; | |
418 } | |
419 } | |
420 } | |
421 # only time | |
422 elsif ($time =~ /\S/) | |
423 { | |
424 if ($array[$i] =~ /^\Q$time\E$/) | |
425 { | |
426 $array[$i] = sprintf "=\"%s\"", $array[$i]; | |
427 } | |
428 } | |
429 # only date | |
430 elsif ($date =~ /\S/) | |
431 { | |
432 if ($array[$i] =~ /^\Q$date\E$/) | |
433 { | |
434 $array[$i] = sprintf "=\"%s\"", $array[$i]; | |
435 } | |
436 } | |
437 } | |
438 } | |
439 } | |
440 | |
441 # make the new escaped line | |
442 $line_escaped = join "\t", @array; | |
443 | |
444 # print it | |
445 print OUTFILE "$line_escaped\n"; | |
446 } | |
447 close INFILE; | |
448 | |
449 close OUTFILE; |