Mercurial > repos > pstew > escape_excel
changeset 1:7726adcf91c6 draft
Uploaded Escape Excel Perl script
author | pstew |
---|---|
date | Fri, 17 Feb 2017 16:38:12 -0500 |
parents | 45f9b77eda87 |
children | 482c23a5abfe |
files | escape_excel.pl |
diffstat | 1 files changed, 449 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/escape_excel.pl Fri Feb 17 16:38:12 2017 -0500 @@ -0,0 +1,449 @@ +#!/usr/bin/perl -w + +use Scalar::Util qw(looks_like_number); + +$date_abbrev_hash{'jan'} = 'january'; +$date_abbrev_hash{'feb'} = 'february'; +$date_abbrev_hash{'mar'} = 'march'; +$date_abbrev_hash{'apr'} = 'april'; +$date_abbrev_hash{'may'} = 'may'; +$date_abbrev_hash{'jun'} = 'jun'; +$date_abbrev_hash{'jul'} = 'july'; +$date_abbrev_hash{'aug'} = 'august'; +$date_abbrev_hash{'sep'} = 'september'; +$date_abbrev_hash{'oct'} = 'october'; +$date_abbrev_hash{'nov'} = 'november'; +$date_abbrev_hash{'dec'} = 'december'; + + +sub is_number +{ + # use what Perl thinks is a number first + if (looks_like_number($_[0])) + { + # Perl treats infinities as numbers, Excel does not + if ($_[0] =~ /^[+-]*inf/) + { + return 0; + } + + return 1; + } + + # Perl cannot handle American comma separators within long numbers. + # Excel does, so we have to check for it. + # Excel doesn't handle European dot separators, at least not when it is + # set to the US locale (my test environment). I am going to leave this + # unsupported for now. + # + return ($_[0] =~ /^([+-]?)[0-9]+(,\d\d\d)*([Ee]([+-]?[0-9]+))?$/); +} + + +sub has_text_month +{ + my $date_str = $_[0]; + my $abbrev; + my $full; + my $xor; + my $prefix_length; + + $candidate = ''; + if ($date_str =~ /^([0-9]{1,4}[- \/]*)?([A-Za-z]{3,9})/) + { + $candidate = lc $2; + } + + if ($candidate eq '') + { + return 0; + } + + $abbrev = substr $candidate, 0, 3; + $full = $date_abbrev_hash{$abbrev}; + + # first three letters are not the start of a month + if (!defined($full)) + { + return 0; + } + + # find common prefix + $xor = "$candidate" ^ "$full"; + $xor =~ /^\0*/; + $prefix_length = $+[0]; + + # if the common prefix is the same as the full candidate, it is real + if (length $candidate eq $prefix_length) + { + return 1; + } + + return 0; +} + +$escape_excel_paranoid_flag = 0; +$escape_sci_flag = 1; +$escape_zeroes_flag = 1; +$escape_dates_flag = 1; + +# read in command line arguments +$num_files = 0; +for ($i = 0; $i < @ARGV; $i++) +{ + $field = $ARGV[$i]; + + if ($field =~ /^-/) + { + if ($field eq '--paranoid') + { + if ($escape_excel_paranoid_flag == 0) + { + $escape_excel_paranoid_flag = 1; + } + else + { + $escape_excel_paranoid_flag = 0; + } + } + elsif ($field eq '--no-sci') + { + $escape_sci_flag = 0; + } + elsif ($field eq '--no-zeroes') + { + $escape_zeroes_flag = 0; + } + elsif ($field eq '--no-dates') + { + $escape_dates_flag = 0; + } + else + { + printf "ABORT -- unknown option %s\n", $field; + $syntax_error_flag = 1; + } + } + else + { + if ($num_files == 1) + { + $outname = $field; + $num_files++; + } + if ($num_files == 0) + { + $filename = $field; + $num_files++; + } + } +} + +# default to stdin if no filename given +if ($num_files == 0) +{ + $filename = '-'; + $num_files = 1; +} + + +# print syntax error message +if ($num_files == 0 || $syntax_error_flag) +{ + printf STDERR "Syntax: escape_excel.pl [options] tab_delimited_input.txt [output.txt]\n"; + printf STDERR " Options:\n"; + printf STDERR " --no-dates Do not escape text that looks like dates\n"; + printf STDERR " --no-sci Do not escape > #E (ex: 12E4) or >11 digit integer parts\n"; + printf STDERR " --no-zeroes Do not escape leading zeroes (ie. 012345)\n"; + printf STDERR " --paranoid Escape *ALL* non-numeric text\n"; + printf STDERR " WARNING -- Excel can take a LONG time to import\n"; + printf STDERR " text files where most fields are escaped.\n"; + printf STDERR " Copy / Paste Values can become near unusuable....\n"; + printf STDERR "\n"; + printf STDERR " Input file must be tab-delimited.\n"; + printf STDERR " Fields will be stripped of existing =\"\" escapes, enclosing \"\", leading \",\n"; + printf STDERR " and leading/trailing spaces, as they may all cause problems.\n"; + printf STDERR "\n"; + printf STDERR " Defaults to escaping most Excel mis-imported fields.\n"; + printf STDERR " Escapes a few extra date-like formats that Excel does not consider dates.\n"; + printf STDERR " Please send unhandled mis-imported field examples (other than gene symbols\n"; + printf STDERR " with 1-digit scientific notation, such as 2e4) to Eric.Welsh\@moffitt.org.\n"; + printf STDERR "\n"; + printf STDERR " Copy / Paste Values in Excel, after importing, to de-escape back into text.\n"; + exit(1); +} + + +# output to STDOUT +if ($num_files == 1) +{ + *OUTFILE = STDOUT; +} +# output to specified file name +if ($num_files == 2) +{ + open OUTFILE, ">$outname" or die "can't open output $outname\n"; +} + + +# read in, escape, and print escaped lines +open INFILE, "$filename" or die "can't open $filename\n"; +while(defined($line=<INFILE>)) +{ + # strip newline characters + $line =~ s/[\r\n]+//g; + + @array = split /\t/, $line; + + # Strip any leading UTF-8 byte order mark so it won't corrupt the + # first field, since regular Perl I/O is not byte order mark aware. + # + # https://en.wikipedia.org/wiki/Byte_order_mark + # + # Various Microsoft products can emit these and screw things up.... + # + for ($i = 0; $i < @array; $i++) + { + $line =~ s/^//; + } + + for ($i = 0; $i < @array; $i++) + { + # continue stripping problematic stuff until all has been stripped + do + { + $changed_flag = 0; + + # remove pre-existing escapes or start/end double quotes, + # since either messes up ="" escapes + while ($array[$i] =~ s/^\=*\"(.*?)\"$/$1/) + { + $changed_flag = 1; + } + + # remove leading ", since they mess up Excel in general + # + # this must be done after "", but before leading/trailing spaces, + # since removing leading/trailing spaces could result in more + # full "" enclosures, which would then be messed up by removing + # only the leading " + # + while ($array[$i] =~ s/^\"//) + { + $changed_flag = 1; + } + + # remove leading spaces, since they won't protect long numbers + if ($array[$i] =~ s/^\s+//) + { + $changed_flag = 1; + } + + # remove trailing spaces, since they won't protect dates + if ($array[$i] =~ s/\s+$//) + { + $changed_flag = 1; + } + } while ($changed_flag) + } + + # escape fields + for ($i = 0; $i < @array; $i++) + { + # Strange but true -- 'text doesn't escape text properly in Excel + # when you try to use it in a text file to import. It will not + # auto-strip the leading ' like it does when you type it in a live + # spreadsheet. "text" doesn't, either. Oddly, ="text" DOES work, + # but an equation containing just a text string and no actual + # equation doesn't make much sense. However, it works, so that's + # what I use here to escape fields into mangle-protected text. + + # escape numeric problems + if (is_number($array[$i])) + { + # keep leading zeroes for >1 digit before the decimal point + if ($escape_zeroes_flag && $array[$i] =~ /^([+-]?)0[0-9]/) + { + $array[$i] = sprintf "=\"%s\"", $array[$i]; + } + + # Escape scientific notation with >= 2 digits before the E, + # since they are likely accessions or plate/well identifiers. + # + # Also escape numbers with >11 digits before the decimal point. + # >11 is when it displays scientific notation in General format, + # which can result in corruption when saved to text. + # >15 would be the limit at which it loses precision internally. + # + # NOTE -- if there is a + or - at the beginning, this rule + # will not trigger. Undecided if this is desired or not. + # Probably desired behavior, since +/- would indicate that + # it is probably a true number, and not an accession or + # plate/well identifier. + # + elsif ($escape_sci_flag) + { + # strip commas before counting digits + $temp = $array[$i]; + $temp =~ s/\,//g; + + if ($temp =~ /^([1-9][0-9]{11,}|[0-9]{2,}[eE])/) + { + $array[$i] = sprintf "=\"%s\"", $array[$i]; + } + } + } + # escape all text if paranoid + elsif ($escape_excel_paranoid_flag) + { + $array[$i] = sprintf "=\"%s\"", $array[$i]; + } + # escape dates + elsif ($escape_dates_flag) + { + # escape single quote at beginning of line + if ($array[$i] =~ /^'/) + { + $array[$i] = sprintf "=\"%s\"", $array[$i]; + } + + # prevent conversion into formulas + elsif ($array[$i] =~ /^\=/) + { + $array[$i] = sprintf "=\"%s\"", $array[$i]; + } + # Excel is smart enough to treat all +/- as not an equation + # but, otherwise, it will convert anything starting with +/- + # into "#NAME?" as a failed invalid equation + elsif ($array[$i] =~ /^[+-]/ && !($array[$i] =~ /^[+-]+$/)) + { + $array[$i] = sprintf "=\"%s\"", $array[$i]; + } + + # check for time and/or date stuff + else + { + $time = ''; + $date = ''; + + # attempt to guess at how excel might autoconvert into time + # allow letter/punctuation at end if it could be part of a date + # it would get too complicated to handle date-ness correctly, + # since I'm already resorting to negative look-ahead + if ($array[$i] =~ /\b(([0-9]+\s+(AM|PM|A|P)|[0-9]+:[0-9]+(:[0-9.]+)?)(\s+(AM|PM|A|P))?)(?!([^-\/, 0-9ADFJMNOSadfjmnos]))/) + { + $time = $1; + } + + $strip_time = $array[$i]; + if ($time =~ /\S/) + { + $strip_time =~ s/\Q$time\E//; + $strip_time =~ s/^\s+//; + $strip_time =~ s/\s+$// + } + + # text date, month in the middle + if ($strip_time =~ /\b([0-9]{1,4}[- \/]*Jan[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Feb[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Mar[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Apr[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*May[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Jun[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Jul[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Aug[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Sep[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Oct[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Nov[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i || + $strip_time =~ /\b([0-9]{1,4}[- \/]*Dec[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i) + { + $temp = $1; + + if (has_text_month($temp)) + { + $date = $temp; + } + } + + # text date, month first + elsif ($strip_time =~ /\b(Jan[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Feb[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Mar[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Apr[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(May[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Jun[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Jul[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Aug[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Sep[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Oct[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Nov[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i || + $strip_time =~ /\b(Dec[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i) + { + $temp = $1; + + if (has_text_month($temp)) + { + $date = $temp; + } + } + + # possibly a numeric date + elsif ($strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,2})\b/ || + $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4}[- \/]+[0-9]{1,2})\b/ || + $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,4})\b/ || + $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4})\b/ || + $strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2})\b/) + { + $date = $1; + } + + # be sure that date and time anchor the ends + # mix of time and date + if ($time =~ /\S/ && $date =~ /\S/) + { + if ($array[$i] =~ /^\Q$time\E(.*)\Q$date\E$/ || + $array[$i] =~ /^\Q$date\E(.*)\Q$time\E$/) + { + $middle = $1; + + # allow blank + # allow for purely whitespace + # allow for a single hyphen, slash, comma + # allow for multiple spaces before and/or after + if ($middle eq '' || + $middle =~ /^\s+$/ || + $middle =~ /^\s*[-\/,]\s*$/) + { + $array[$i] = sprintf "=\"%s\"", $array[$i]; + } + } + } + # only time + elsif ($time =~ /\S/) + { + if ($array[$i] =~ /^\Q$time\E$/) + { + $array[$i] = sprintf "=\"%s\"", $array[$i]; + } + } + # only date + elsif ($date =~ /\S/) + { + if ($array[$i] =~ /^\Q$date\E$/) + { + $array[$i] = sprintf "=\"%s\"", $array[$i]; + } + } + } + } + } + + # make the new escaped line + $line_escaped = join "\t", @array; + + # print it + print OUTFILE "$line_escaped\n"; +} +close INFILE; + +close OUTFILE;