Repository 'escape_excel'
hg clone https://toolshed.g2.bx.psu.edu/repos/pstew/escape_excel

Changeset 1:7726adcf91c6 (2017-02-17)
Previous changeset 0:45f9b77eda87 (2017-02-17) Next changeset 2:482c23a5abfe (2017-02-17)
Commit message:
Uploaded Escape Excel Perl script
added:
escape_excel.pl
b
diff -r 45f9b77eda87 -r 7726adcf91c6 escape_excel.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/escape_excel.pl Fri Feb 17 16:38:12 2017 -0500
[
b'@@ -0,0 +1,449 @@\n+#!/usr/bin/perl -w\n+\n+use Scalar::Util qw(looks_like_number);\n+\n+$date_abbrev_hash{\'jan\'} = \'january\';\n+$date_abbrev_hash{\'feb\'} = \'february\';\n+$date_abbrev_hash{\'mar\'} = \'march\';\n+$date_abbrev_hash{\'apr\'} = \'april\';\n+$date_abbrev_hash{\'may\'} = \'may\';\n+$date_abbrev_hash{\'jun\'} = \'jun\';\n+$date_abbrev_hash{\'jul\'} = \'july\';\n+$date_abbrev_hash{\'aug\'} = \'august\';\n+$date_abbrev_hash{\'sep\'} = \'september\';\n+$date_abbrev_hash{\'oct\'} = \'october\';\n+$date_abbrev_hash{\'nov\'} = \'november\';\n+$date_abbrev_hash{\'dec\'} = \'december\';\n+\n+\n+sub is_number\n+{\n+    # use what Perl thinks is a number first\n+    if (looks_like_number($_[0]))\n+    {\n+        # Perl treats infinities as numbers, Excel does not\n+        if ($_[0] =~ /^[+-]*inf/)\n+        {\n+            return 0;\n+        }\n+        \n+        return 1;\n+    }\n+\n+    # Perl cannot handle American comma separators within long numbers.\n+    # Excel does, so we have to check for it.\n+    # Excel doesn\'t handle European dot separators, at least not when it is\n+    #  set to the US locale (my test environment).  I am going to leave this\n+    #  unsupported for now.\n+    #\n+    return ($_[0] =~ /^([+-]?)[0-9]+(,\\d\\d\\d)*([Ee]([+-]?[0-9]+))?$/);\n+}\n+\n+\n+sub has_text_month\n+{\n+    my $date_str = $_[0];\n+    my $abbrev;\n+    my $full;\n+    my $xor;\n+    my $prefix_length;\n+\n+    $candidate = \'\';\n+    if ($date_str =~ /^([0-9]{1,4}[- \\/]*)?([A-Za-z]{3,9})/)\n+    {\n+        $candidate = lc $2;\n+    }\n+\n+    if ($candidate eq \'\')\n+    {\n+        return 0;\n+    }\n+\n+    $abbrev = substr $candidate, 0, 3;\n+    $full = $date_abbrev_hash{$abbrev};\n+\n+    # first three letters are not the start of a month\n+    if (!defined($full))\n+    {\n+        return 0;\n+    }\n+\n+    # find common prefix\n+    $xor = "$candidate" ^ "$full";\n+    $xor =~ /^\\0*/;\n+    $prefix_length = $+[0];\n+    \n+    # if the common prefix is the same as the full candidate, it is real\n+    if (length $candidate eq $prefix_length)\n+    {\n+        return 1;\n+    }\n+    \n+    return 0;\n+}\n+\n+$escape_excel_paranoid_flag = 0;\n+$escape_sci_flag = 1;\n+$escape_zeroes_flag = 1;\n+$escape_dates_flag = 1;\n+\n+# read in command line arguments\n+$num_files = 0;\n+for ($i = 0; $i < @ARGV; $i++)\n+{\n+    $field = $ARGV[$i];\n+\n+    if ($field =~ /^-/)\n+    {\n+        if ($field eq \'--paranoid\')\n+        {\n+            if ($escape_excel_paranoid_flag == 0)\n+            {\n+                $escape_excel_paranoid_flag = 1;\n+            }\n+            else\n+            {\n+                $escape_excel_paranoid_flag = 0;\n+            }\n+        }\n+        elsif ($field eq \'--no-sci\')\n+        {\n+            $escape_sci_flag = 0;\n+        }\n+        elsif ($field eq \'--no-zeroes\')\n+        {\n+            $escape_zeroes_flag = 0;\n+        }\n+        elsif ($field eq \'--no-dates\')\n+        {\n+            $escape_dates_flag = 0;\n+        }\n+        else\n+        {\n+            printf "ABORT -- unknown option %s\\n", $field;\n+            $syntax_error_flag = 1;\n+        }\n+    }\n+    else\n+    {\n+        if ($num_files == 1)\n+        {\n+            $outname = $field;\n+            $num_files++;\n+        }\n+        if ($num_files == 0)\n+        {\n+            $filename = $field;\n+            $num_files++;\n+        }\n+    }\n+}\n+\n+# default to stdin if no filename given\n+if ($num_files == 0)\n+{\n+    $filename = \'-\';\n+    $num_files = 1;\n+}\n+\n+\n+# print syntax error message\n+if ($num_files == 0 || $syntax_error_flag)\n+{\n+    printf STDERR "Syntax: escape_excel.pl [options] tab_delimited_input.txt [output.txt]\\n";\n+    printf STDERR "   Options:\\n";\n+    printf STDERR "      --no-dates   Do not escape text that looks like dates\\n";\n+    printf STDERR "      --no-sci     Do not escape > #E (ex: 12E4) or >11 digit integer parts\\n";\n+    printf STDERR "      --no-zeroes  Do not escape leading zeroes (ie. 012345)\\n";\n+    printf STDERR "      --paranoid   Escape *ALL* non-numeric text\\n";\n+    printf STDERR "                   WARNING -- Excel can take a LONG ti'..b',4}[- \\/]*Nov[A-Za-z]{0,6}([- \\/]*[0-9]{1,4})?)\\b/i ||\n+                  $strip_time =~ /\\b([0-9]{1,4}[- \\/]*Dec[A-Za-z]{0,6}([- \\/]*[0-9]{1,4})?)\\b/i)\n+              {\n+                  $temp = $1;\n+              \n+                  if (has_text_month($temp))\n+                  {\n+                      $date = $temp;\n+                  }\n+              }\n+\n+              # text date, month first\n+              elsif ($strip_time =~ /\\b(Jan[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Feb[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Mar[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Apr[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(May[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Jun[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Jul[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Aug[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Sep[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Oct[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Nov[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+                     $strip_time =~ /\\b(Dec[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i)\n+              {\n+                  $temp = $1;\n+\n+                  if (has_text_month($temp))\n+                  {\n+                      $date = $temp;\n+                  }\n+              }\n+\n+              # possibly a numeric date\n+              elsif ($strip_time =~ /\\b([0-9]{1,4}[- \\/]+[0-9]{1,2}[- \\/]+[0-9]{1,2})\\b/ ||\n+                     $strip_time =~ /\\b([0-9]{1,2}[- \\/]+[0-9]{1,4}[- \\/]+[0-9]{1,2})\\b/ ||\n+                     $strip_time =~ /\\b([0-9]{1,2}[- \\/]+[0-9]{1,2}[- \\/]+[0-9]{1,4})\\b/ ||\n+                     $strip_time =~ /\\b([0-9]{1,2}[- \\/]+[0-9]{1,4})\\b/ ||\n+                     $strip_time =~ /\\b([0-9]{1,4}[- \\/]+[0-9]{1,2})\\b/)\n+              {\n+                  $date = $1;\n+              }\n+              \n+              # be sure that date and time anchor the ends\n+              # mix of time and date\n+              if ($time =~ /\\S/ && $date =~ /\\S/)\n+              {\n+                  if ($array[$i] =~ /^\\Q$time\\E(.*)\\Q$date\\E$/ ||\n+                      $array[$i] =~ /^\\Q$date\\E(.*)\\Q$time\\E$/)\n+                  {\n+                      $middle = $1;\n+\n+                      # allow blank\n+                      # allow for purely whitespace\n+                      # allow for a single hyphen, slash, comma\n+                      #  allow for multiple spaces before and/or after\n+                      if ($middle eq \'\' ||\n+                          $middle =~ /^\\s+$/ ||\n+                          $middle =~ /^\\s*[-\\/,]\\s*$/)\n+                      {\n+                          $array[$i] = sprintf "=\\"%s\\"", $array[$i];\n+                      }\n+                  }\n+              }\n+              # only time\n+              elsif ($time =~ /\\S/)\n+              {\n+                  if ($array[$i] =~ /^\\Q$time\\E$/)\n+                  {\n+                      $array[$i] = sprintf "=\\"%s\\"", $array[$i];\n+                  }\n+              }\n+              # only date\n+              elsif ($date =~ /\\S/)\n+              {\n+                  if ($array[$i] =~ /^\\Q$date\\E$/)\n+                  {\n+                      $array[$i] = sprintf "=\\"%s\\"", $array[$i];\n+                  }\n+              }\n+          }\n+        }\n+    }\n+    \n+    # make the new escaped line\n+    $line_escaped = join "\\t", @array;\n+    \n+    # print it\n+    print OUTFILE "$line_escaped\\n";\n+}\n+close INFILE;\n+\n+close OUTFILE;\n'