changeset 1:7726adcf91c6 draft

Uploaded Escape Excel Perl script
author pstew
date Fri, 17 Feb 2017 16:38:12 -0500
parents 45f9b77eda87
children 482c23a5abfe
files escape_excel.pl
diffstat 1 files changed, 449 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/escape_excel.pl	Fri Feb 17 16:38:12 2017 -0500
@@ -0,0 +1,449 @@
+#!/usr/bin/perl -w
+
+use Scalar::Util qw(looks_like_number);
+
+$date_abbrev_hash{'jan'} = 'january';
+$date_abbrev_hash{'feb'} = 'february';
+$date_abbrev_hash{'mar'} = 'march';
+$date_abbrev_hash{'apr'} = 'april';
+$date_abbrev_hash{'may'} = 'may';
+$date_abbrev_hash{'jun'} = 'jun';
+$date_abbrev_hash{'jul'} = 'july';
+$date_abbrev_hash{'aug'} = 'august';
+$date_abbrev_hash{'sep'} = 'september';
+$date_abbrev_hash{'oct'} = 'october';
+$date_abbrev_hash{'nov'} = 'november';
+$date_abbrev_hash{'dec'} = 'december';
+
+
+sub is_number
+{
+    # use what Perl thinks is a number first
+    if (looks_like_number($_[0]))
+    {
+        # Perl treats infinities as numbers, Excel does not
+        if ($_[0] =~ /^[+-]*inf/)
+        {
+            return 0;
+        }
+        
+        return 1;
+    }
+
+    # Perl cannot handle American comma separators within long numbers.
+    # Excel does, so we have to check for it.
+    # Excel doesn't handle European dot separators, at least not when it is
+    #  set to the US locale (my test environment).  I am going to leave this
+    #  unsupported for now.
+    #
+    return ($_[0] =~ /^([+-]?)[0-9]+(,\d\d\d)*([Ee]([+-]?[0-9]+))?$/);
+}
+
+
+sub has_text_month
+{
+    my $date_str = $_[0];
+    my $abbrev;
+    my $full;
+    my $xor;
+    my $prefix_length;
+
+    $candidate = '';
+    if ($date_str =~ /^([0-9]{1,4}[- \/]*)?([A-Za-z]{3,9})/)
+    {
+        $candidate = lc $2;
+    }
+
+    if ($candidate eq '')
+    {
+        return 0;
+    }
+
+    $abbrev = substr $candidate, 0, 3;
+    $full = $date_abbrev_hash{$abbrev};
+
+    # first three letters are not the start of a month
+    if (!defined($full))
+    {
+        return 0;
+    }
+
+    # find common prefix
+    $xor = "$candidate" ^ "$full";
+    $xor =~ /^\0*/;
+    $prefix_length = $+[0];
+    
+    # if the common prefix is the same as the full candidate, it is real
+    if (length $candidate eq $prefix_length)
+    {
+        return 1;
+    }
+    
+    return 0;
+}
+
+$escape_excel_paranoid_flag = 0;
+$escape_sci_flag = 1;
+$escape_zeroes_flag = 1;
+$escape_dates_flag = 1;
+
+# read in command line arguments
+$num_files = 0;
+for ($i = 0; $i < @ARGV; $i++)
+{
+    $field = $ARGV[$i];
+
+    if ($field =~ /^-/)
+    {
+        if ($field eq '--paranoid')
+        {
+            if ($escape_excel_paranoid_flag == 0)
+            {
+                $escape_excel_paranoid_flag = 1;
+            }
+            else
+            {
+                $escape_excel_paranoid_flag = 0;
+            }
+        }
+        elsif ($field eq '--no-sci')
+        {
+            $escape_sci_flag = 0;
+        }
+        elsif ($field eq '--no-zeroes')
+        {
+            $escape_zeroes_flag = 0;
+        }
+        elsif ($field eq '--no-dates')
+        {
+            $escape_dates_flag = 0;
+        }
+        else
+        {
+            printf "ABORT -- unknown option %s\n", $field;
+            $syntax_error_flag = 1;
+        }
+    }
+    else
+    {
+        if ($num_files == 1)
+        {
+            $outname = $field;
+            $num_files++;
+        }
+        if ($num_files == 0)
+        {
+            $filename = $field;
+            $num_files++;
+        }
+    }
+}
+
+# default to stdin if no filename given
+if ($num_files == 0)
+{
+    $filename = '-';
+    $num_files = 1;
+}
+
+
+# print syntax error message
+if ($num_files == 0 || $syntax_error_flag)
+{
+    printf STDERR "Syntax: escape_excel.pl [options] tab_delimited_input.txt [output.txt]\n";
+    printf STDERR "   Options:\n";
+    printf STDERR "      --no-dates   Do not escape text that looks like dates\n";
+    printf STDERR "      --no-sci     Do not escape > #E (ex: 12E4) or >11 digit integer parts\n";
+    printf STDERR "      --no-zeroes  Do not escape leading zeroes (ie. 012345)\n";
+    printf STDERR "      --paranoid   Escape *ALL* non-numeric text\n";
+    printf STDERR "                   WARNING -- Excel can take a LONG time to import\n";
+    printf STDERR "                   text files where most fields are escaped.\n";
+    printf STDERR "                   Copy / Paste Values can become near unusuable....\n";
+    printf STDERR "\n";
+    printf STDERR "   Input file must be tab-delimited.\n";
+    printf STDERR "   Fields will be stripped of existing =\"\" escapes, enclosing \"\", leading \",\n";
+    printf STDERR "    and leading/trailing spaces, as they may all cause problems.\n";
+    printf STDERR "\n";
+    printf STDERR "   Defaults to escaping most Excel mis-imported fields.\n";
+    printf STDERR "   Escapes a few extra date-like formats that Excel does not consider dates.\n";
+    printf STDERR "   Please send unhandled mis-imported field examples (other than gene symbols\n";
+    printf STDERR "    with 1-digit scientific notation, such as 2e4) to Eric.Welsh\@moffitt.org.\n";
+    printf STDERR "\n";
+    printf STDERR "   Copy / Paste Values in Excel, after importing, to de-escape back into text.\n";
+    exit(1);
+}
+
+
+# output to STDOUT
+if ($num_files == 1)
+{
+    *OUTFILE = STDOUT;
+}
+# output to specified file name
+if ($num_files == 2)
+{
+    open OUTFILE, ">$outname" or die "can't open output $outname\n";
+}
+
+
+# read in, escape, and print escaped lines
+open INFILE, "$filename" or die "can't open $filename\n";
+while(defined($line=<INFILE>))
+{
+    # strip newline characters
+    $line =~ s/[\r\n]+//g;
+
+    @array = split /\t/, $line;
+
+    # Strip any leading UTF-8 byte order mark so it won't corrupt the
+    #  first field, since regular Perl I/O is not byte order mark aware.
+    #
+    # https://en.wikipedia.org/wiki/Byte_order_mark
+    #
+    # Various Microsoft products can emit these and screw things up....
+    #
+    for ($i = 0; $i < @array; $i++)
+    {
+        $line =~ s/^//;
+    }
+    
+    for ($i = 0; $i < @array; $i++)
+    {
+        # continue stripping problematic stuff until all has been stripped
+        do
+        {
+            $changed_flag = 0;
+            
+            # remove pre-existing escapes or start/end double quotes,
+            # since either messes up ="" escapes
+            while ($array[$i] =~ s/^\=*\"(.*?)\"$/$1/)
+            {
+                $changed_flag = 1;
+            }
+        
+            # remove leading ", since they mess up Excel in general
+            #
+            # this must be done after "", but before leading/trailing spaces,
+            # since removing leading/trailing spaces could result in more
+            # full "" enclosures, which would then be messed up by removing
+            # only the leading "
+            #
+            while ($array[$i] =~ s/^\"//)
+            {
+                $changed_flag = 1;
+            }
+
+            # remove leading spaces, since they won't protect long numbers
+            if ($array[$i] =~ s/^\s+//)
+            {
+                $changed_flag = 1;
+            }
+
+            # remove trailing spaces, since they won't protect dates
+            if ($array[$i] =~ s/\s+$//)
+            {
+                $changed_flag = 1;
+            }
+        } while ($changed_flag)
+    }
+    
+    # escape fields
+    for ($i = 0; $i < @array; $i++)
+    {
+        # Strange but true -- 'text doesn't escape text properly in Excel
+        # when you try to use it in a text file to import.  It will not
+        # auto-strip the leading ' like it does when you type it in a live
+        # spreadsheet.  "text" doesn't, either.  Oddly, ="text" DOES work,
+        # but an equation containing just a text string and no actual
+        # equation doesn't make much sense.  However, it works, so that's
+        # what I use here to escape fields into mangle-protected text.
+
+        # escape numeric problems
+        if (is_number($array[$i]))
+        {
+          # keep leading zeroes for >1 digit before the decimal point
+          if ($escape_zeroes_flag && $array[$i] =~ /^([+-]?)0[0-9]/)
+          {
+              $array[$i] = sprintf "=\"%s\"", $array[$i];
+          }
+
+          # Escape scientific notation with >= 2 digits before the E,
+          #  since they are likely accessions or plate/well identifiers.
+          #
+          # Also escape numbers with >11 digits before the decimal point.
+          # >11 is when it displays scientific notation in General format,
+          #  which can result in corruption when saved to text.
+          # >15 would be the limit at which it loses precision internally.
+          #
+          # NOTE -- if there is a + or - at the beginning, this rule
+          #         will not trigger.  Undecided if this is desired or not.
+          #         Probably desired behavior, since +/- would indicate that
+          #         it is probably a true number, and not an accession or
+          #         plate/well identifier.
+          #
+          elsif ($escape_sci_flag)
+          {
+              # strip commas before counting digits
+              $temp = $array[$i];
+              $temp =~ s/\,//g;
+              
+              if ($temp =~ /^([1-9][0-9]{11,}|[0-9]{2,}[eE])/)
+              {
+                  $array[$i] = sprintf "=\"%s\"", $array[$i];
+              }
+          }
+        }
+        # escape all text if paranoid
+        elsif ($escape_excel_paranoid_flag)
+        {
+          $array[$i] = sprintf "=\"%s\"", $array[$i];
+        }
+        # escape dates
+        elsif ($escape_dates_flag)
+        {
+          # escape single quote at beginning of line
+          if ($array[$i] =~ /^'/)
+          {
+              $array[$i] = sprintf "=\"%s\"", $array[$i];
+          }
+
+          # prevent conversion into formulas
+          elsif ($array[$i] =~ /^\=/)
+          {
+              $array[$i] = sprintf "=\"%s\"", $array[$i];
+          }
+          # Excel is smart enough to treat all +/- as not an equation
+          #  but, otherwise, it will convert anything starting with +/-
+          #  into "#NAME?" as a failed invalid equation
+          elsif ($array[$i] =~ /^[+-]/ && !($array[$i] =~ /^[+-]+$/))
+          {
+              $array[$i] = sprintf "=\"%s\"", $array[$i];
+          }
+
+          # check for time and/or date stuff
+          else
+          {
+              $time = '';
+              $date = '';
+          
+              # attempt to guess at how excel might autoconvert into time
+              # allow letter/punctuation at end if it could be part of a date
+              #  it would get too complicated to handle date-ness correctly,
+              #  since I'm already resorting to negative look-ahead
+              if ($array[$i] =~ /\b(([0-9]+\s+(AM|PM|A|P)|[0-9]+:[0-9]+(:[0-9.]+)?)(\s+(AM|PM|A|P))?)(?!([^-\/, 0-9ADFJMNOSadfjmnos]))/)
+              {
+                  $time = $1;
+              }
+              
+              $strip_time = $array[$i];
+              if ($time =~ /\S/)
+              {
+                  $strip_time =~ s/\Q$time\E//;
+                  $strip_time =~ s/^\s+//;
+                  $strip_time =~ s/\s+$//
+              }
+
+              # text date, month in the middle
+              if ($strip_time =~ /\b([0-9]{1,4}[- \/]*Jan[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Feb[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Mar[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Apr[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*May[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Jun[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Jul[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Aug[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Sep[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Oct[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Nov[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
+                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Dec[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i)
+              {
+                  $temp = $1;
+              
+                  if (has_text_month($temp))
+                  {
+                      $date = $temp;
+                  }
+              }
+
+              # text date, month first
+              elsif ($strip_time =~ /\b(Jan[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Feb[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Mar[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Apr[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(May[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Jun[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Jul[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Aug[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Sep[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Oct[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Nov[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
+                     $strip_time =~ /\b(Dec[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i)
+              {
+                  $temp = $1;
+
+                  if (has_text_month($temp))
+                  {
+                      $date = $temp;
+                  }
+              }
+
+              # possibly a numeric date
+              elsif ($strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,2})\b/ ||
+                     $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4}[- \/]+[0-9]{1,2})\b/ ||
+                     $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,4})\b/ ||
+                     $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4})\b/ ||
+                     $strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2})\b/)
+              {
+                  $date = $1;
+              }
+              
+              # be sure that date and time anchor the ends
+              # mix of time and date
+              if ($time =~ /\S/ && $date =~ /\S/)
+              {
+                  if ($array[$i] =~ /^\Q$time\E(.*)\Q$date\E$/ ||
+                      $array[$i] =~ /^\Q$date\E(.*)\Q$time\E$/)
+                  {
+                      $middle = $1;
+
+                      # allow blank
+                      # allow for purely whitespace
+                      # allow for a single hyphen, slash, comma
+                      #  allow for multiple spaces before and/or after
+                      if ($middle eq '' ||
+                          $middle =~ /^\s+$/ ||
+                          $middle =~ /^\s*[-\/,]\s*$/)
+                      {
+                          $array[$i] = sprintf "=\"%s\"", $array[$i];
+                      }
+                  }
+              }
+              # only time
+              elsif ($time =~ /\S/)
+              {
+                  if ($array[$i] =~ /^\Q$time\E$/)
+                  {
+                      $array[$i] = sprintf "=\"%s\"", $array[$i];
+                  }
+              }
+              # only date
+              elsif ($date =~ /\S/)
+              {
+                  if ($array[$i] =~ /^\Q$date\E$/)
+                  {
+                      $array[$i] = sprintf "=\"%s\"", $array[$i];
+                  }
+              }
+          }
+        }
+    }
+    
+    # make the new escaped line
+    $line_escaped = join "\t", @array;
+    
+    # print it
+    print OUTFILE "$line_escaped\n";
+}
+close INFILE;
+
+close OUTFILE;