Previous changeset 0:45f9b77eda87 (2017-02-17) Next changeset 2:482c23a5abfe (2017-02-17) |
Commit message:
Uploaded Escape Excel Perl script |
added:
escape_excel.pl |
b |
diff -r 45f9b77eda87 -r 7726adcf91c6 escape_excel.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/escape_excel.pl Fri Feb 17 16:38:12 2017 -0500 |
[ |
b'@@ -0,0 +1,449 @@\n+#!/usr/bin/perl -w\n+\n+use Scalar::Util qw(looks_like_number);\n+\n+$date_abbrev_hash{\'jan\'} = \'january\';\n+$date_abbrev_hash{\'feb\'} = \'february\';\n+$date_abbrev_hash{\'mar\'} = \'march\';\n+$date_abbrev_hash{\'apr\'} = \'april\';\n+$date_abbrev_hash{\'may\'} = \'may\';\n+$date_abbrev_hash{\'jun\'} = \'jun\';\n+$date_abbrev_hash{\'jul\'} = \'july\';\n+$date_abbrev_hash{\'aug\'} = \'august\';\n+$date_abbrev_hash{\'sep\'} = \'september\';\n+$date_abbrev_hash{\'oct\'} = \'october\';\n+$date_abbrev_hash{\'nov\'} = \'november\';\n+$date_abbrev_hash{\'dec\'} = \'december\';\n+\n+\n+sub is_number\n+{\n+ # use what Perl thinks is a number first\n+ if (looks_like_number($_[0]))\n+ {\n+ # Perl treats infinities as numbers, Excel does not\n+ if ($_[0] =~ /^[+-]*inf/)\n+ {\n+ return 0;\n+ }\n+ \n+ return 1;\n+ }\n+\n+ # Perl cannot handle American comma separators within long numbers.\n+ # Excel does, so we have to check for it.\n+ # Excel doesn\'t handle European dot separators, at least not when it is\n+ # set to the US locale (my test environment). I am going to leave this\n+ # unsupported for now.\n+ #\n+ return ($_[0] =~ /^([+-]?)[0-9]+(,\\d\\d\\d)*([Ee]([+-]?[0-9]+))?$/);\n+}\n+\n+\n+sub has_text_month\n+{\n+ my $date_str = $_[0];\n+ my $abbrev;\n+ my $full;\n+ my $xor;\n+ my $prefix_length;\n+\n+ $candidate = \'\';\n+ if ($date_str =~ /^([0-9]{1,4}[- \\/]*)?([A-Za-z]{3,9})/)\n+ {\n+ $candidate = lc $2;\n+ }\n+\n+ if ($candidate eq \'\')\n+ {\n+ return 0;\n+ }\n+\n+ $abbrev = substr $candidate, 0, 3;\n+ $full = $date_abbrev_hash{$abbrev};\n+\n+ # first three letters are not the start of a month\n+ if (!defined($full))\n+ {\n+ return 0;\n+ }\n+\n+ # find common prefix\n+ $xor = "$candidate" ^ "$full";\n+ $xor =~ /^\\0*/;\n+ $prefix_length = $+[0];\n+ \n+ # if the common prefix is the same as the full candidate, it is real\n+ if (length $candidate eq $prefix_length)\n+ {\n+ return 1;\n+ }\n+ \n+ return 0;\n+}\n+\n+$escape_excel_paranoid_flag = 0;\n+$escape_sci_flag = 1;\n+$escape_zeroes_flag = 1;\n+$escape_dates_flag = 1;\n+\n+# read in command line arguments\n+$num_files = 0;\n+for ($i = 0; $i < @ARGV; $i++)\n+{\n+ $field = $ARGV[$i];\n+\n+ if ($field =~ /^-/)\n+ {\n+ if ($field eq \'--paranoid\')\n+ {\n+ if ($escape_excel_paranoid_flag == 0)\n+ {\n+ $escape_excel_paranoid_flag = 1;\n+ }\n+ else\n+ {\n+ $escape_excel_paranoid_flag = 0;\n+ }\n+ }\n+ elsif ($field eq \'--no-sci\')\n+ {\n+ $escape_sci_flag = 0;\n+ }\n+ elsif ($field eq \'--no-zeroes\')\n+ {\n+ $escape_zeroes_flag = 0;\n+ }\n+ elsif ($field eq \'--no-dates\')\n+ {\n+ $escape_dates_flag = 0;\n+ }\n+ else\n+ {\n+ printf "ABORT -- unknown option %s\\n", $field;\n+ $syntax_error_flag = 1;\n+ }\n+ }\n+ else\n+ {\n+ if ($num_files == 1)\n+ {\n+ $outname = $field;\n+ $num_files++;\n+ }\n+ if ($num_files == 0)\n+ {\n+ $filename = $field;\n+ $num_files++;\n+ }\n+ }\n+}\n+\n+# default to stdin if no filename given\n+if ($num_files == 0)\n+{\n+ $filename = \'-\';\n+ $num_files = 1;\n+}\n+\n+\n+# print syntax error message\n+if ($num_files == 0 || $syntax_error_flag)\n+{\n+ printf STDERR "Syntax: escape_excel.pl [options] tab_delimited_input.txt [output.txt]\\n";\n+ printf STDERR " Options:\\n";\n+ printf STDERR " --no-dates Do not escape text that looks like dates\\n";\n+ printf STDERR " --no-sci Do not escape > #E (ex: 12E4) or >11 digit integer parts\\n";\n+ printf STDERR " --no-zeroes Do not escape leading zeroes (ie. 012345)\\n";\n+ printf STDERR " --paranoid Escape *ALL* non-numeric text\\n";\n+ printf STDERR " WARNING -- Excel can take a LONG ti'..b',4}[- \\/]*Nov[A-Za-z]{0,6}([- \\/]*[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b([0-9]{1,4}[- \\/]*Dec[A-Za-z]{0,6}([- \\/]*[0-9]{1,4})?)\\b/i)\n+ {\n+ $temp = $1;\n+ \n+ if (has_text_month($temp))\n+ {\n+ $date = $temp;\n+ }\n+ }\n+\n+ # text date, month first\n+ elsif ($strip_time =~ /\\b(Jan[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Feb[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Mar[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Apr[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(May[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Jun[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Jul[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Aug[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Sep[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Oct[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Nov[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i ||\n+ $strip_time =~ /\\b(Dec[A-Za-z]{0,6}[- \\/]*[0-9]{1,4}([- \\/]+[0-9]{1,4})?)\\b/i)\n+ {\n+ $temp = $1;\n+\n+ if (has_text_month($temp))\n+ {\n+ $date = $temp;\n+ }\n+ }\n+\n+ # possibly a numeric date\n+ elsif ($strip_time =~ /\\b([0-9]{1,4}[- \\/]+[0-9]{1,2}[- \\/]+[0-9]{1,2})\\b/ ||\n+ $strip_time =~ /\\b([0-9]{1,2}[- \\/]+[0-9]{1,4}[- \\/]+[0-9]{1,2})\\b/ ||\n+ $strip_time =~ /\\b([0-9]{1,2}[- \\/]+[0-9]{1,2}[- \\/]+[0-9]{1,4})\\b/ ||\n+ $strip_time =~ /\\b([0-9]{1,2}[- \\/]+[0-9]{1,4})\\b/ ||\n+ $strip_time =~ /\\b([0-9]{1,4}[- \\/]+[0-9]{1,2})\\b/)\n+ {\n+ $date = $1;\n+ }\n+ \n+ # be sure that date and time anchor the ends\n+ # mix of time and date\n+ if ($time =~ /\\S/ && $date =~ /\\S/)\n+ {\n+ if ($array[$i] =~ /^\\Q$time\\E(.*)\\Q$date\\E$/ ||\n+ $array[$i] =~ /^\\Q$date\\E(.*)\\Q$time\\E$/)\n+ {\n+ $middle = $1;\n+\n+ # allow blank\n+ # allow for purely whitespace\n+ # allow for a single hyphen, slash, comma\n+ # allow for multiple spaces before and/or after\n+ if ($middle eq \'\' ||\n+ $middle =~ /^\\s+$/ ||\n+ $middle =~ /^\\s*[-\\/,]\\s*$/)\n+ {\n+ $array[$i] = sprintf "=\\"%s\\"", $array[$i];\n+ }\n+ }\n+ }\n+ # only time\n+ elsif ($time =~ /\\S/)\n+ {\n+ if ($array[$i] =~ /^\\Q$time\\E$/)\n+ {\n+ $array[$i] = sprintf "=\\"%s\\"", $array[$i];\n+ }\n+ }\n+ # only date\n+ elsif ($date =~ /\\S/)\n+ {\n+ if ($array[$i] =~ /^\\Q$date\\E$/)\n+ {\n+ $array[$i] = sprintf "=\\"%s\\"", $array[$i];\n+ }\n+ }\n+ }\n+ }\n+ }\n+ \n+ # make the new escaped line\n+ $line_escaped = join "\\t", @array;\n+ \n+ # print it\n+ print OUTFILE "$line_escaped\\n";\n+}\n+close INFILE;\n+\n+close OUTFILE;\n' |