view escape_excel.pl @ 2:482c23a5abfe draft default tip

Uploaded Python wrapper
author pstew
date Fri, 17 Feb 2017 16:38:30 -0500
parents 7726adcf91c6
children
line wrap: on
line source

#!/usr/bin/perl -w

use Scalar::Util qw(looks_like_number);

$date_abbrev_hash{'jan'} = 'january';
$date_abbrev_hash{'feb'} = 'february';
$date_abbrev_hash{'mar'} = 'march';
$date_abbrev_hash{'apr'} = 'april';
$date_abbrev_hash{'may'} = 'may';
$date_abbrev_hash{'jun'} = 'jun';
$date_abbrev_hash{'jul'} = 'july';
$date_abbrev_hash{'aug'} = 'august';
$date_abbrev_hash{'sep'} = 'september';
$date_abbrev_hash{'oct'} = 'october';
$date_abbrev_hash{'nov'} = 'november';
$date_abbrev_hash{'dec'} = 'december';


sub is_number
{
    # use what Perl thinks is a number first
    if (looks_like_number($_[0]))
    {
        # Perl treats infinities as numbers, Excel does not
        if ($_[0] =~ /^[+-]*inf/)
        {
            return 0;
        }
        
        return 1;
    }

    # Perl cannot handle American comma separators within long numbers.
    # Excel does, so we have to check for it.
    # Excel doesn't handle European dot separators, at least not when it is
    #  set to the US locale (my test environment).  I am going to leave this
    #  unsupported for now.
    #
    return ($_[0] =~ /^([+-]?)[0-9]+(,\d\d\d)*([Ee]([+-]?[0-9]+))?$/);
}


sub has_text_month
{
    my $date_str = $_[0];
    my $abbrev;
    my $full;
    my $xor;
    my $prefix_length;

    $candidate = '';
    if ($date_str =~ /^([0-9]{1,4}[- \/]*)?([A-Za-z]{3,9})/)
    {
        $candidate = lc $2;
    }

    if ($candidate eq '')
    {
        return 0;
    }

    $abbrev = substr $candidate, 0, 3;
    $full = $date_abbrev_hash{$abbrev};

    # first three letters are not the start of a month
    if (!defined($full))
    {
        return 0;
    }

    # find common prefix
    $xor = "$candidate" ^ "$full";
    $xor =~ /^\0*/;
    $prefix_length = $+[0];
    
    # if the common prefix is the same as the full candidate, it is real
    if (length $candidate eq $prefix_length)
    {
        return 1;
    }
    
    return 0;
}

$escape_excel_paranoid_flag = 0;
$escape_sci_flag = 1;
$escape_zeroes_flag = 1;
$escape_dates_flag = 1;

# read in command line arguments
$num_files = 0;
for ($i = 0; $i < @ARGV; $i++)
{
    $field = $ARGV[$i];

    if ($field =~ /^-/)
    {
        if ($field eq '--paranoid')
        {
            if ($escape_excel_paranoid_flag == 0)
            {
                $escape_excel_paranoid_flag = 1;
            }
            else
            {
                $escape_excel_paranoid_flag = 0;
            }
        }
        elsif ($field eq '--no-sci')
        {
            $escape_sci_flag = 0;
        }
        elsif ($field eq '--no-zeroes')
        {
            $escape_zeroes_flag = 0;
        }
        elsif ($field eq '--no-dates')
        {
            $escape_dates_flag = 0;
        }
        else
        {
            printf "ABORT -- unknown option %s\n", $field;
            $syntax_error_flag = 1;
        }
    }
    else
    {
        if ($num_files == 1)
        {
            $outname = $field;
            $num_files++;
        }
        if ($num_files == 0)
        {
            $filename = $field;
            $num_files++;
        }
    }
}

# default to stdin if no filename given
if ($num_files == 0)
{
    $filename = '-';
    $num_files = 1;
}


# print syntax error message
if ($num_files == 0 || $syntax_error_flag)
{
    printf STDERR "Syntax: escape_excel.pl [options] tab_delimited_input.txt [output.txt]\n";
    printf STDERR "   Options:\n";
    printf STDERR "      --no-dates   Do not escape text that looks like dates\n";
    printf STDERR "      --no-sci     Do not escape > #E (ex: 12E4) or >11 digit integer parts\n";
    printf STDERR "      --no-zeroes  Do not escape leading zeroes (ie. 012345)\n";
    printf STDERR "      --paranoid   Escape *ALL* non-numeric text\n";
    printf STDERR "                   WARNING -- Excel can take a LONG time to import\n";
    printf STDERR "                   text files where most fields are escaped.\n";
    printf STDERR "                   Copy / Paste Values can become near unusuable....\n";
    printf STDERR "\n";
    printf STDERR "   Input file must be tab-delimited.\n";
    printf STDERR "   Fields will be stripped of existing =\"\" escapes, enclosing \"\", leading \",\n";
    printf STDERR "    and leading/trailing spaces, as they may all cause problems.\n";
    printf STDERR "\n";
    printf STDERR "   Defaults to escaping most Excel mis-imported fields.\n";
    printf STDERR "   Escapes a few extra date-like formats that Excel does not consider dates.\n";
    printf STDERR "   Please send unhandled mis-imported field examples (other than gene symbols\n";
    printf STDERR "    with 1-digit scientific notation, such as 2e4) to Eric.Welsh\@moffitt.org.\n";
    printf STDERR "\n";
    printf STDERR "   Copy / Paste Values in Excel, after importing, to de-escape back into text.\n";
    exit(1);
}


# output to STDOUT
if ($num_files == 1)
{
    *OUTFILE = STDOUT;
}
# output to specified file name
if ($num_files == 2)
{
    open OUTFILE, ">$outname" or die "can't open output $outname\n";
}


# read in, escape, and print escaped lines
open INFILE, "$filename" or die "can't open $filename\n";
while(defined($line=<INFILE>))
{
    # strip newline characters
    $line =~ s/[\r\n]+//g;

    @array = split /\t/, $line;

    # Strip any leading UTF-8 byte order mark so it won't corrupt the
    #  first field, since regular Perl I/O is not byte order mark aware.
    #
    # https://en.wikipedia.org/wiki/Byte_order_mark
    #
    # Various Microsoft products can emit these and screw things up....
    #
    for ($i = 0; $i < @array; $i++)
    {
        $line =~ s/^//;
    }
    
    for ($i = 0; $i < @array; $i++)
    {
        # continue stripping problematic stuff until all has been stripped
        do
        {
            $changed_flag = 0;
            
            # remove pre-existing escapes or start/end double quotes,
            # since either messes up ="" escapes
            while ($array[$i] =~ s/^\=*\"(.*?)\"$/$1/)
            {
                $changed_flag = 1;
            }
        
            # remove leading ", since they mess up Excel in general
            #
            # this must be done after "", but before leading/trailing spaces,
            # since removing leading/trailing spaces could result in more
            # full "" enclosures, which would then be messed up by removing
            # only the leading "
            #
            while ($array[$i] =~ s/^\"//)
            {
                $changed_flag = 1;
            }

            # remove leading spaces, since they won't protect long numbers
            if ($array[$i] =~ s/^\s+//)
            {
                $changed_flag = 1;
            }

            # remove trailing spaces, since they won't protect dates
            if ($array[$i] =~ s/\s+$//)
            {
                $changed_flag = 1;
            }
        } while ($changed_flag)
    }
    
    # escape fields
    for ($i = 0; $i < @array; $i++)
    {
        # Strange but true -- 'text doesn't escape text properly in Excel
        # when you try to use it in a text file to import.  It will not
        # auto-strip the leading ' like it does when you type it in a live
        # spreadsheet.  "text" doesn't, either.  Oddly, ="text" DOES work,
        # but an equation containing just a text string and no actual
        # equation doesn't make much sense.  However, it works, so that's
        # what I use here to escape fields into mangle-protected text.

        # escape numeric problems
        if (is_number($array[$i]))
        {
          # keep leading zeroes for >1 digit before the decimal point
          if ($escape_zeroes_flag && $array[$i] =~ /^([+-]?)0[0-9]/)
          {
              $array[$i] = sprintf "=\"%s\"", $array[$i];
          }

          # Escape scientific notation with >= 2 digits before the E,
          #  since they are likely accessions or plate/well identifiers.
          #
          # Also escape numbers with >11 digits before the decimal point.
          # >11 is when it displays scientific notation in General format,
          #  which can result in corruption when saved to text.
          # >15 would be the limit at which it loses precision internally.
          #
          # NOTE -- if there is a + or - at the beginning, this rule
          #         will not trigger.  Undecided if this is desired or not.
          #         Probably desired behavior, since +/- would indicate that
          #         it is probably a true number, and not an accession or
          #         plate/well identifier.
          #
          elsif ($escape_sci_flag)
          {
              # strip commas before counting digits
              $temp = $array[$i];
              $temp =~ s/\,//g;
              
              if ($temp =~ /^([1-9][0-9]{11,}|[0-9]{2,}[eE])/)
              {
                  $array[$i] = sprintf "=\"%s\"", $array[$i];
              }
          }
        }
        # escape all text if paranoid
        elsif ($escape_excel_paranoid_flag)
        {
          $array[$i] = sprintf "=\"%s\"", $array[$i];
        }
        # escape dates
        elsif ($escape_dates_flag)
        {
          # escape single quote at beginning of line
          if ($array[$i] =~ /^'/)
          {
              $array[$i] = sprintf "=\"%s\"", $array[$i];
          }

          # prevent conversion into formulas
          elsif ($array[$i] =~ /^\=/)
          {
              $array[$i] = sprintf "=\"%s\"", $array[$i];
          }
          # Excel is smart enough to treat all +/- as not an equation
          #  but, otherwise, it will convert anything starting with +/-
          #  into "#NAME?" as a failed invalid equation
          elsif ($array[$i] =~ /^[+-]/ && !($array[$i] =~ /^[+-]+$/))
          {
              $array[$i] = sprintf "=\"%s\"", $array[$i];
          }

          # check for time and/or date stuff
          else
          {
              $time = '';
              $date = '';
          
              # attempt to guess at how excel might autoconvert into time
              # allow letter/punctuation at end if it could be part of a date
              #  it would get too complicated to handle date-ness correctly,
              #  since I'm already resorting to negative look-ahead
              if ($array[$i] =~ /\b(([0-9]+\s+(AM|PM|A|P)|[0-9]+:[0-9]+(:[0-9.]+)?)(\s+(AM|PM|A|P))?)(?!([^-\/, 0-9ADFJMNOSadfjmnos]))/)
              {
                  $time = $1;
              }
              
              $strip_time = $array[$i];
              if ($time =~ /\S/)
              {
                  $strip_time =~ s/\Q$time\E//;
                  $strip_time =~ s/^\s+//;
                  $strip_time =~ s/\s+$//
              }

              # text date, month in the middle
              if ($strip_time =~ /\b([0-9]{1,4}[- \/]*Jan[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Feb[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Mar[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Apr[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*May[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Jun[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Jul[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Aug[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Sep[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Oct[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Nov[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i ||
                  $strip_time =~ /\b([0-9]{1,4}[- \/]*Dec[A-Za-z]{0,6}([- \/]*[0-9]{1,4})?)\b/i)
              {
                  $temp = $1;
              
                  if (has_text_month($temp))
                  {
                      $date = $temp;
                  }
              }

              # text date, month first
              elsif ($strip_time =~ /\b(Jan[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Feb[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Mar[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Apr[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(May[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Jun[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Jul[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Aug[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Sep[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Oct[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Nov[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i ||
                     $strip_time =~ /\b(Dec[A-Za-z]{0,6}[- \/]*[0-9]{1,4}([- \/]+[0-9]{1,4})?)\b/i)
              {
                  $temp = $1;

                  if (has_text_month($temp))
                  {
                      $date = $temp;
                  }
              }

              # possibly a numeric date
              elsif ($strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,2})\b/ ||
                     $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4}[- \/]+[0-9]{1,2})\b/ ||
                     $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,2}[- \/]+[0-9]{1,4})\b/ ||
                     $strip_time =~ /\b([0-9]{1,2}[- \/]+[0-9]{1,4})\b/ ||
                     $strip_time =~ /\b([0-9]{1,4}[- \/]+[0-9]{1,2})\b/)
              {
                  $date = $1;
              }
              
              # be sure that date and time anchor the ends
              # mix of time and date
              if ($time =~ /\S/ && $date =~ /\S/)
              {
                  if ($array[$i] =~ /^\Q$time\E(.*)\Q$date\E$/ ||
                      $array[$i] =~ /^\Q$date\E(.*)\Q$time\E$/)
                  {
                      $middle = $1;

                      # allow blank
                      # allow for purely whitespace
                      # allow for a single hyphen, slash, comma
                      #  allow for multiple spaces before and/or after
                      if ($middle eq '' ||
                          $middle =~ /^\s+$/ ||
                          $middle =~ /^\s*[-\/,]\s*$/)
                      {
                          $array[$i] = sprintf "=\"%s\"", $array[$i];
                      }
                  }
              }
              # only time
              elsif ($time =~ /\S/)
              {
                  if ($array[$i] =~ /^\Q$time\E$/)
                  {
                      $array[$i] = sprintf "=\"%s\"", $array[$i];
                  }
              }
              # only date
              elsif ($date =~ /\S/)
              {
                  if ($array[$i] =~ /^\Q$date\E$/)
                  {
                      $array[$i] = sprintf "=\"%s\"", $array[$i];
                  }
              }
          }
        }
    }
    
    # make the new escaped line
    $line_escaped = join "\t", @array;
    
    # print it
    print OUTFILE "$line_escaped\n";
}
close INFILE;

close OUTFILE;