comparison 2.4/lib/perl5/x86_64-linux-gnu-thread-multi/String/Approx.pm @ 18:1163c16cb3c0 draft

Uploaded
author plus91-technologies-pvt-ltd
date Mon, 02 Jun 2014 07:35:53 -0400
parents e3609c8714fb
children
comparison
equal deleted inserted replaced
17:5343ef57827f 18:1163c16cb3c0
1 package String::Approx;
2
3 require v5.8.0;
4
5 $VERSION = '3.27';
6
7 use strict;
8 local $^W = 1;
9
10 use Carp;
11 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
12
13 require Exporter;
14 require DynaLoader;
15
16 @ISA = qw(Exporter DynaLoader);
17
18 @EXPORT_OK = qw(amatch asubstitute aindex aslice arindex
19 adist adistr adistword adistrword);
20
21 bootstrap String::Approx $VERSION;
22
23 my $CACHE_MAX = 1000; # high water mark
24 my $CACHE_PURGE = 0.75; # purge this much of the least used
25 my $CACHE_N_PURGE; # purge this many of the least used
26
27 sub cache_n_purge () {
28 $CACHE_N_PURGE = $CACHE_MAX * $CACHE_PURGE;
29 $CACHE_N_PURGE = 1 if $CACHE_N_PURGE < 1;
30 return $CACHE_N_PURGE;
31 }
32
33 cache_n_purge();
34
35 sub cache_max (;$) {
36 if (@_ == 0) {
37 return $CACHE_MAX;
38 } else {
39 $CACHE_MAX = shift;
40 }
41 $CACHE_MAX = 0 if $CACHE_MAX < 0;
42 cache_n_purge();
43 }
44
45 sub cache_purge (;$) {
46 if (@_ == 0) {
47 return $CACHE_PURGE;
48 } else {
49 $CACHE_PURGE = shift;
50 }
51 if ($CACHE_PURGE < 0) {
52 $CACHE_PURGE = 0;
53 } elsif ($CACHE_PURGE > 1) {
54 $CACHE_PURGE = 1;
55 }
56 cache_n_purge();
57 }
58
59 my %_simple;
60 my %_simple_usage_count;
61
62 sub _cf_simple {
63 my $P = shift;
64
65 my @usage =
66 sort { $_simple_usage_count{$a} <=> $_simple_usage_count{$b} }
67 grep { $_ ne $P }
68 keys %_simple_usage_count;
69
70 # Make room, delete the least used entries.
71 $#usage = $CACHE_N_PURGE - 1;
72
73 delete @_simple_usage_count{@usage};
74 delete @_simple{@usage};
75 }
76
77 sub _simple {
78 my $P = shift;
79
80 my $_simple = new(__PACKAGE__, $P);
81
82 if ($CACHE_MAX) {
83 $_simple{$P} = $_simple unless exists $_simple{$P};
84
85 $_simple_usage_count{$P}++;
86
87 if (keys %_simple_usage_count > $CACHE_MAX) {
88 _cf_simple($P);
89 }
90 }
91
92 return ( $_simple );
93 }
94
95 sub _parse_param {
96 use integer;
97
98 my ($n, @param) = @_;
99 my %param;
100
101 foreach (@param) {
102 while ($_ ne '') {
103 s/^\s+//;
104 if (s/^([IDS]\s*)?(\d+)(\s*%)?//) {
105 my $k = defined $3 ? (($2-1) * $n) / 100 + ($2 ? 1 : 0) : $2;
106
107 if (defined $1) {
108 $param{$1} = $k;
109 } else {
110 $param{k} = $k;
111 }
112 } elsif (s/^initial_position\W+(\d+)\b//) {
113 $param{'initial_position'} = $1;
114 } elsif (s/^final_position\W+(\d+)\b//) {
115 $param{'final_position'} = $1;
116 } elsif (s/^position_range\W+(\d+)\b//) {
117 $param{'position_range'} = $1;
118 } elsif (s/^minimal_distance\b//) {
119 $param{'minimal_distance'} = 1;
120 } elsif (s/^i//) {
121 $param{ i } = 1;
122 } elsif (s/^g//) {
123 $param{ g } = 1;
124 } elsif (s/^\?//) {
125 $param{'?'} = 1;
126 } else {
127 warn "unknown parameter: '$_'\n";
128 return;
129 }
130 }
131 }
132
133 return %param;
134 }
135
136 my %_param_key;
137 my %_parsed_param;
138
139 my %_complex;
140 my %_complex_usage_count;
141
142 sub _cf_complex {
143 my $P = shift;
144
145 my @usage =
146 sort { $_complex_usage_count{$a} <=>
147 $_complex_usage_count{$b} }
148 grep { $_ ne $P }
149 keys %_complex_usage_count;
150
151 # Make room, delete the least used entries.
152 $#usage = $CACHE_N_PURGE - 1;
153
154 delete @_complex_usage_count{@usage};
155 delete @_complex{@usage};
156 }
157
158 sub _complex {
159 my ($P, @param) = @_;
160 unshift @param, length $P;
161 my $param = "@param";
162 my $_param_key;
163 my %param;
164 my $complex;
165 my $is_new;
166
167 unless (exists $_param_key{$param}) {
168 %param = _parse_param(@param);
169 $_parsed_param{$param} = { %param };
170 $_param_key{$param} = join(" ", %param);
171 } else {
172 %param = %{ $_parsed_param{$param} };
173 }
174
175 $_param_key = $_param_key{$param};
176
177 if ($CACHE_MAX) {
178 if (exists $_complex{$P}->{$_param_key}) {
179 $complex = $_complex{$P}->{$_param_key};
180 }
181 }
182
183 unless (defined $complex) {
184 if (exists $param{'k'}) {
185 $complex = new(__PACKAGE__, $P, $param{k});
186 } else {
187 $complex = new(__PACKAGE__, $P);
188 }
189 $_complex{$P}->{$_param_key} = $complex if $CACHE_MAX;
190 $is_new = 1;
191 }
192
193 if ($is_new) {
194 $complex->set_greedy unless exists $param{'?'};
195
196 $complex->set_insertions($param{'I'})
197 if exists $param{'I'};
198 $complex->set_deletions($param{'D'})
199 if exists $param{'D'};
200 $complex->set_substitutions($param{'S'})
201 if exists $param{'S'};
202
203 $complex->set_caseignore_slice
204 if exists $param{'i'};
205
206 $complex->set_text_initial_position($param{'initial_position'})
207 if exists $param{'initial_position'};
208
209 $complex->set_text_final_position($param{'final_position'})
210 if exists $param{'final_position'};
211
212 $complex->set_text_position_range($param{'position_range'})
213 if exists $param{'position_range'};
214
215 $complex->set_minimal_distance($param{'minimal_distance'})
216 if exists $param{'minimal_distance'};
217 }
218
219 if ($CACHE_MAX) {
220 $_complex_usage_count{$P}->{$_param_key}++;
221
222 # If our cache overfloweth.
223 if (scalar keys %_complex_usage_count > $CACHE_MAX) {
224 _cf_complex($P);
225 }
226 }
227
228 return ( $complex, %param );
229 }
230
231 sub cache_disable {
232 cache_max(0);
233 }
234
235 sub cache_flush_all {
236 my $old_purge = cache_purge();
237 cache_purge(1);
238 _cf_simple('');
239 _cf_complex('');
240 cache_purge($old_purge);
241 }
242
243 sub amatch {
244 my $P = shift;
245 return 1 unless length $P;
246 my $a = ((@_ && ref $_[0] eq 'ARRAY') ?
247 _complex($P, @{ shift(@_) }) : _simple($P))[0];
248
249 if (@_) {
250 if (wantarray) {
251 return grep { $a->match($_) } @_;
252 } else {
253 foreach (@_) {
254 return 1 if $a->match($_);
255 }
256 return 0;
257 }
258 }
259 if (defined $_) {
260 if (wantarray) {
261 return $a->match($_) ? $_ : undef;
262 } else {
263 return 1 if $a->match($_);
264 }
265 }
266 return $a->match($_) if defined $_;
267
268 warn "amatch: \$_ is undefined: what are you matching?\n";
269 return;
270 }
271
272 sub _find_substitute {
273 my ($ri, $rs, $i, $s, $S, $rn) = @_;
274
275 push @{ $ri }, $i;
276 push @{ $rs }, $s;
277
278 my $pre = substr($_, 0, $i);
279 my $old = substr($_, $i, $s);
280 my $suf = substr($_, $i + $s);
281 my $new = $S;
282
283 $new =~ s/\$\`/$pre/g;
284 $new =~ s/\$\&/$old/g;
285 $new =~ s/\$\'/$suf/g;
286
287 push @{ $rn }, $new;
288 }
289
290 sub _do_substitute {
291 my ($rn, $ri, $rs, $rS) = @_;
292
293 my $d = 0;
294 my $n = $_;
295
296 foreach my $i (0..$#$rn) {
297 substr($n, $ri->[$i] + $d, $rs->[$i]) = $rn->[$i];
298 $d += length($rn->[$i]) - $rs->[$i];
299 }
300
301 push @{ $rS }, $n;
302 }
303
304 sub asubstitute {
305 my $P = shift;
306 my $S = shift;
307 my ($a, %p) =
308 (@_ && ref $_[0] eq 'ARRAY') ?
309 _complex($P, @{ shift(@_) }) : _simple($P);
310
311 my ($i, $s, @i, @s, @n, @S);
312
313 if (@_) {
314 if (exists $p{ g }) {
315 foreach (@_) {
316 @s = @i = @n = ();
317 while (($i, $s) = $a->slice_next($_)) {
318 if (defined $i) {
319 _find_substitute(\@i, \@s, $i, $s, $S, \@n);
320 }
321 }
322 _do_substitute(\@n, \@i, \@s, \@S) if @n;
323 }
324 } else {
325 foreach (@_) {
326 @s = @i = @n = ();
327 ($i, $s) = $a->slice($_);
328 if (defined $i) {
329 _find_substitute(\@i, \@s, $i, $s, $S, \@n);
330 _do_substitute(\@n, \@i, \@s, \@S);
331 }
332 }
333 }
334 return @S;
335 } elsif (defined $_) {
336 if (exists $p{ g }) {
337 while (($i, $s) = $a->slice_next($_)) {
338 if (defined $i) {
339 _find_substitute(\@i, \@s, $i, $s, $S, \@n);
340 }
341 }
342 _do_substitute(\@n, \@i, \@s, \@S) if @n;
343 } else {
344 ($i, $s) = $a->slice($_);
345 if (defined $i) {
346 _find_substitute(\@i, \@s, $i, $s, $S, \@n);
347 _do_substitute(\@n, \@i, \@s, \@S);
348 }
349 }
350 return $_ = $n[0];
351 } else {
352 warn "asubstitute: \$_ is undefined: what are you substituting?\n";
353 return;
354 }
355 }
356
357 sub aindex {
358 my $P = shift;
359 return 0 unless length $P;
360 my $a = ((@_ && ref $_[0] eq 'ARRAY') ?
361 _complex($P, @{ shift(@_) }) : _simple($P))[0];
362
363 $a->set_greedy; # The *first* match, thank you.
364
365 if (@_) {
366 if (wantarray) {
367 return map { $a->index($_) } @_;
368 } else {
369 return $a->index($_[0]);
370 }
371 }
372 return $a->index($_) if defined $_;
373
374 warn "aindex: \$_ is undefined: what are you indexing?\n";
375 return;
376 }
377
378 sub aslice {
379 my $P = shift;
380 return (0, 0) unless length $P;
381 my $a = ((@_ && ref $_[0] eq 'ARRAY') ?
382 _complex($P, @{ shift(@_) }) : _simple($P))[0];
383
384 $a->set_greedy; # The *first* match, thank you.
385
386 if (@_) {
387 return map { [ $a->slice($_) ] } @_;
388 }
389 return $a->slice($_) if defined $_;
390
391 warn "aslice: \$_ is undefined: what are you slicing?\n";
392 return;
393 }
394
395 sub _adist {
396 my $s0 = shift;
397 my $s1 = shift;
398 my ($aslice) = aslice($s0, ['minimal_distance', @_], $s1);
399 my ($index, $size, $distance) = @$aslice;
400 my ($l0, $l1) = map { length } ($s0, $s1);
401 return $l0 <= $l1 ? $distance : -$distance;
402 }
403
404 sub adist {
405 my $a0 = shift;
406 my $a1 = shift;
407 if (length($a0) == 0) {
408 return length($a1);
409 }
410 if (length($a1) == 0) {
411 return length($a0);
412 }
413 my @m = ref $_[0] eq 'ARRAY' ? @{shift()} : ();
414 if (ref $a0 eq 'ARRAY') {
415 if (ref $a1 eq 'ARRAY') {
416 return [ map { adist($a0, $_, @m) } @{$a1} ];
417 } else {
418 return [ map { _adist($_, $a1, @m) } @{$a0} ];
419 }
420 } elsif (ref $a1 eq 'ARRAY') {
421 return [ map { _adist($a0, $_, @m) } @{$a1} ];
422 } else {
423 if (wantarray) {
424 return map { _adist($a0, $_, @m) } ($a1, @_);
425 } else {
426 return _adist($a0, $a1, @m);
427 }
428 }
429 }
430
431 sub adistr {
432 my $a0 = shift;
433 my $a1 = shift;
434 my @m = ref $_[0] eq 'ARRAY' ? shift : ();
435 if (ref $a0 eq 'ARRAY') {
436 if (ref $a1 eq 'ARRAY') {
437 my $l0 = length();
438 return $l0 ? [ map { adist($a0, $_, @m) }
439 @{$a1} ] :
440 [ ];
441 } else {
442 return [ map { my $l0 = length();
443 $l0 ? _adist($_, $a1, @m) / $l0 : undef
444 } @{$a0} ];
445 }
446 } elsif (ref $a1 eq 'ARRAY') {
447 my $l0 = length($a0);
448 return [] unless $l0;
449 return [ map { _adist($a0, $_, @m) / $l0 } @{$a1} ];
450 } else {
451 my $l0 = length($a0);
452 if (wantarray) {
453 return map { $l0 ? _adist($a0, $_, @m) / $l0 : undef } ($a1, @_);
454 } else {
455 return undef unless $l0;
456 return _adist($a0, $a1, @m) / $l0;
457 }
458 }
459 }
460
461 sub adistword {
462 return adist($_[0], $_[1], ['position_range=0']);
463 }
464
465 sub adistrword {
466 return adistr($_[0], $_[1], ['position_range=0']);
467 }
468
469 sub arindex {
470 my $P = shift;
471 my $l = length $P;
472 return 0 unless $l;
473 my $R = reverse $P;
474 my $a = ((@_ && ref $_[0] eq 'ARRAY') ?
475 _complex($R, @{ shift(@_) }) : _simple($R))[0];
476
477 $a->set_greedy; # The *first* match, thank you.
478
479 if (@_) {
480 if (wantarray) {
481 return map {
482 my $aindex = $a->index(scalar reverse());
483 $aindex == -1 ? $aindex : (length($_) - $aindex - $l);
484 } @_;
485 } else {
486 my $aindex = $a->index(scalar reverse $_[0]);
487 return $aindex == -1 ? $aindex : (length($_[0]) - $aindex - $l);
488 }
489 }
490 if (defined $_) {
491 my $aindex = $a->index(scalar reverse());
492 return $aindex == -1 ? $aindex : (length($_) - $aindex - $l);
493 }
494
495 warn "arindex: \$_ is undefined: what are you indexing?\n";
496 return;
497 }
498
499 1;
500 __END__
501 =pod
502
503 =head1 NAME
504
505 String::Approx - Perl extension for approximate matching (fuzzy matching)
506
507 =head1 SYNOPSIS
508
509 use String::Approx 'amatch';
510
511 print if amatch("foobar");
512
513 my @matches = amatch("xyzzy", @inputs);
514
515 my @catches = amatch("plugh", ['2'], @inputs);
516
517 =head1 DESCRIPTION
518
519 String::Approx lets you match and substitute strings approximately.
520 With this you can emulate errors: typing errorrs, speling errors,
521 closely related vocabularies (colour color), genetic mutations (GAG
522 ACT), abbreviations (McScot, MacScot).
523
524 NOTE: String::Approx suits the task of B<string matching>, not
525 B<string comparison>, and it works for B<strings>, not for B<text>.
526
527 If you want to compare strings for similarity, you probably just want
528 the Levenshtein edit distance (explained below), the Text::Levenshtein
529 and Text::LevenshteinXS modules in CPAN. See also Text::WagnerFischer
530 and Text::PhraseDistance. (There are functions for this in String::Approx,
531 e.g. adist(), but their results sometimes differ from the bare Levenshtein
532 et al.)
533
534 If you want to compare things like text or source code, consisting of
535 B<words> or B<tokens> and B<phrases> and B<sentences>, or
536 B<expressions> and B<statements>, you should probably use some other
537 tool than String::Approx, like for example the standard UNIX diff(1)
538 tool, or the Algorithm::Diff module from CPAN.
539
540 The measure of B<approximateness> is the I<Levenshtein edit distance>.
541 It is the total number of "edits": insertions,
542
543 word world
544
545 deletions,
546
547 monkey money
548
549 and substitutions
550
551 sun fun
552
553 required to transform a string to another string. For example, to
554 transform I<"lead"> into I<"gold">, you need three edits:
555
556 lead gead goad gold
557
558 The edit distance of "lead" and "gold" is therefore three, or 75%.
559
560 B<String::Approx> uses the Levenshtein edit distance as its measure, but
561 String::Approx is not well-suited for comparing strings of different
562 length, in other words, if you want a "fuzzy eq", see above.
563 String::Approx is more like regular expressions or index(), it finds
564 substrings that are close matches.>
565
566 =head1 MATCH
567
568 use String::Approx 'amatch';
569
570 $matched = amatch("pattern")
571 $matched = amatch("pattern", [ modifiers ])
572
573 $any_matched = amatch("pattern", @inputs)
574 $any_matched = amatch("pattern", [ modifiers ], @inputs)
575
576 @match = amatch("pattern")
577 @match = amatch("pattern", [ modifiers ])
578
579 @matches = amatch("pattern", @inputs)
580 @matches = amatch("pattern", [ modifiers ], @inputs)
581
582 Match B<pattern> approximately. In list context return the matched
583 B<@inputs>. If no inputs are given, match against the B<$_>. In scalar
584 context return true if I<any> of the inputs match, false if none match.
585
586 Notice that the pattern is a string. Not a regular expression. None
587 of the regular expression notations (^, ., *, and so on) work. They
588 are characters just like the others. Note-on-note: some limited form
589 of I<"regular expressionism"> is planned in future: for example
590 character classes ([abc]) and I<any-chars> (.). But that feature will
591 be turned on by a special I<modifier> (just a guess: "r"), so there
592 should be no backward compatibility problem.
593
594 Notice also that matching is not symmetric. The inputs are matched
595 against the pattern, not the other way round. In other words: the
596 pattern can be a substring, a submatch, of an input element. An input
597 element is always a superstring of the pattern.
598
599 =head2 MODIFIERS
600
601 With the modifiers you can control the amount of approximateness and
602 certain other control variables. The modifiers are one or more
603 strings, for example B<"i">, within a string optionally separated by
604 whitespace. The modifiers are inside an anonymous array: the B<[ ]>
605 in the syntax are not notational, they really do mean B<[ ]>, for
606 example B<[ "i", "2" ]>. B<["2 i"]> would be identical.
607
608 The implicit default approximateness is 10%, rounded up. In other
609 words: every tenth character in the pattern may be an error, an edit.
610 You can explicitly set the maximum approximateness by supplying a
611 modifier like
612
613 number
614 number%
615
616 Examples: B<"3">, B<"15%">.
617
618 Note that C<0%> is not rounded up, it is equal to C<0>.
619
620 Using a similar syntax you can separately control the maximum number
621 of insertions, deletions, and substitutions by prefixing the numbers
622 with I, D, or S, like this:
623
624 Inumber
625 Inumber%
626 Dnumber
627 Dnumber%
628 Snumber
629 Snumber%
630
631 Examples: B<"I2">, B<"D20%">, B<"S0">.
632
633 You can ignore case (B<"A"> becames equal to B<"a"> and vice versa)
634 by adding the B<"i"> modifier.
635
636 For example
637
638 [ "i 25%", "S0" ]
639
640 means I<ignore case>, I<allow every fourth character to be "an edit">,
641 but allow I<no substitutions>. (See L<NOTES> about disallowing
642 substitutions or insertions.)
643
644 NOTE: setting C<I0 D0 S0> is not equivalent to using index().
645 If you want to use index(), use index().
646
647 =head1 SUBSTITUTE
648
649 use String::Approx 'asubstitute';
650
651 @substituted = asubstitute("pattern", "replacement")
652 @substituted = asubstitute("pattern", "replacement", @inputs)
653 @substituted = asubstitute("pattern", "replacement", [ modifiers ])
654 @substituted = asubstitute("pattern", "replacement",
655 [ modifiers ], @inputs)
656
657 Substitute approximate B<pattern> with B<replacement> and return as a
658 list <copies> of B<@inputs>, the substitutions having been made on the
659 elements that did match the pattern. If no inputs are given,
660 substitute in the B<$_>. The replacement can contain magic strings
661 B<$&>, B<$`>, B<$'> that stand for the matched string, the string
662 before it, and the string after it, respectively. All the other
663 arguments are as in C<amatch()>, plus one additional modifier, B<"g">
664 which means substitute globally (all the matches in an element and not
665 just the first one, as is the default).
666
667 See L<BAD NEWS> about the unfortunate stinginess of C<asubstitute()>.
668
669 =head1 INDEX
670
671 use String::Approx 'aindex';
672
673 $index = aindex("pattern")
674 @indices = aindex("pattern", @inputs)
675 $index = aindex("pattern", [ modifiers ])
676 @indices = aindex("pattern", [ modifiers ], @inputs)
677
678 Like C<amatch()> but returns the index/indices at which the pattern
679 matches approximately. In list context and if C<@inputs> are used,
680 returns a list of indices, one index for each input element.
681 If there's no approximate match, C<-1> is returned as the index.
682
683 NOTE: if there is character repetition (e.g. "aa") either in
684 the pattern or in the text, the returned index might start
685 "too early". This is consistent with the goal of the module
686 of matching "as early as possible", just like regular expressions
687 (that there might be a "less approximate" match starting later is
688 of somewhat irrelevant).
689
690 There's also backwards-scanning C<arindex()>.
691
692 =head1 SLICE
693
694 use String::Approx 'aslice';
695
696 ($index, $size) = aslice("pattern")
697 ([$i0, $s0], ...) = aslice("pattern", @inputs)
698 ($index, $size) = aslice("pattern", [ modifiers ])
699 ([$i0, $s0], ...) = aslice("pattern", [ modifiers ], @inputs)
700
701 Like C<aindex()> but returns also the size (length) of the match.
702 If the match fails, returns an empty list (when matching against C<$_>)
703 or an empty anonymous list corresponding to the particular input.
704
705 NOTE: size of the match will very probably be something you did not
706 expect (such as longer than the pattern, or a negative number). This
707 may or may not be fixed in future releases. Also the beginning of the
708 match may vary from the expected as with aindex(), see above.
709
710 If the modifier
711
712 "minimal_distance"
713
714 is used, the minimal possible edit distance is returned as the
715 third element:
716
717 ($index, $size, $distance) = aslice("pattern", [ modifiers ])
718 ([$i0, $s0, $d0], ...) = aslice("pattern", [ modifiers ], @inputs)
719
720 =head1 DISTANCE
721
722 use String::Approx 'adist';
723
724 $dist = adist("pattern", $input);
725 @dist = adist("pattern", @input);
726
727 Return the I<edit distance> or distances between the pattern and the
728 input or inputs. Zero edit distance means exact match. (Remember
729 that the match can 'float' in the inputs, the match is a substring
730 match.) If the pattern is longer than the input or inputs, the
731 returned distance or distances is or are negative.
732
733 use String::Approx 'adistr';
734
735 $dist = adistr("pattern", $input);
736 @dist = adistr("pattern", @inputs);
737
738 Return the B<relative> I<edit distance> or distances between the
739 pattern and the input or inputs. Zero relative edit distance means
740 exact match, one means completely different. (Remember that the
741 match can 'float' in the inputs, the match is a substring match.) If
742 the pattern is longer than the input or inputs, the returned distance
743 or distances is or are negative.
744
745 You can use adist() or adistr() to sort the inputs according to their
746 approximateness:
747
748 my %d;
749 @d{@inputs} = map { abs } adistr("pattern", @inputs);
750 my @d = sort { $d{$a} <=> $d{$b} } @inputs;
751
752 Now C<@d> contains the inputs, the most like C<"pattern"> first.
753
754 =head1 CONTROLLING THE CACHE
755
756 C<String::Approx> maintains a LU (least-used) cache that holds the
757 'matching engines' for each instance of a I<pattern+modifiers>. The
758 cache is intended to help the case where you match a small set of
759 patterns against a large set of string. However, the more engines you
760 cache the more you eat memory. If you have a lot of different
761 patterns or if you have a lot of memory to burn, you may want to
762 control the cache yourself. For example, allowing a larger cache
763 consumes more memory but probably runs a little bit faster since the
764 cache fills (and needs flushing) less often.
765
766 The cache has two parameters: I<max> and I<purge>. The first one
767 is the maximum size of the cache and the second one is the cache
768 flushing ratio: when the number of cache entries exceeds I<max>,
769 I<max> times I<purge> cache entries are flushed. The default
770 values are 1000 and 0.75, respectively, which means that when
771 the 1001st entry would be cached, 750 least used entries will
772 be removed from the cache. To access the parameters you can
773 use the calls
774
775 $now_max = String::Approx::cache_max();
776 String::Approx::cache_max($new_max);
777
778 $now_purge = String::Approx::cache_purge();
779 String::Approx::cache_purge($new_purge);
780
781 $limit = String::Approx::cache_n_purge();
782
783 To be honest, there are actually B<two> caches: the first one is used
784 far the patterns with no modifiers, the second one for the patterns
785 with pattern modifiers. Using the standard parameters you will
786 therefore actually cache up to 2000 entries. The above calls control
787 both caches for the same price.
788
789 To disable caching completely use
790
791 String::Approx::cache_disable();
792
793 Note that this doesn't flush any possibly existing cache entries,
794 to do that use
795
796 String::Approx::cache_flush_all();
797
798 =head1 NOTES
799
800 Because matching is by I<substrings>, not by whole strings, insertions
801 and substitutions produce often very similar results: "abcde" matches
802 "axbcde" either by insertion B<or> substitution of "x".
803
804 The maximum edit distance is also the maximum number of edits.
805 That is, the B<"I2"> in
806
807 amatch("abcd", ["I2"])
808
809 is useless because the maximum edit distance is (implicitly) 1.
810 You may have meant to say
811
812 amatch("abcd", ["2D1S1"])
813
814 or something like that.
815
816 If you want to simulate transposes
817
818 feet fete
819
820 you need to allow at least edit distance of two because in terms of
821 our edit primitives a transpose is first one deletion and then one
822 insertion.
823
824 =head2 TEXT POSITION
825
826 The starting and ending positions of matching, substituting, indexing, or
827 slicing can be changed from the beginning and end of the input(s) to
828 some other positions by using either or both of the modifiers
829
830 "initial_position=24"
831 "final_position=42"
832
833 or the both the modifiers
834
835 "initial_position=24"
836 "position_range=10"
837
838 By setting the B<"position_range"> to be zero you can limit
839 (anchor) the operation to happen only once (if a match is possible)
840 at the position.
841
842 =head1 VERSION
843
844 Major release 3.
845
846 =head1 CHANGES FROM VERSION 2
847
848 =head2 GOOD NEWS
849
850 =over 4
851
852 =item The version 3 is 2-3 times faster than version 2
853
854 =item No pattern length limitation
855
856 The algorithm is independent on the pattern length: its time
857 complexity is I<O(kn)>, where I<k> is the number of edits and I<n> the
858 length of the text (input). The preprocessing of the pattern will of
859 course take some I<O(m)> (I<m> being the pattern length) time, but
860 C<amatch()> and C<asubstitute()> cache the result of this
861 preprocessing so that it is done only once per pattern.
862
863 =back
864
865 =head2 BAD NEWS
866
867 =over 4
868
869 =item You do need a C compiler to install the module
870
871 Perl's regular expressions are no more used; instead a faster and more
872 scalable algorithm written in C is used.
873
874 =item C<asubstitute()> is now always stingy
875
876 The string matched and substituted is now always stingy, as short
877 as possible. It used to be as long as possible. This is an unfortunate
878 change stemming from switching the matching algorithm. Example: with
879 edit distance of two and substituting for B<"word"> from B<"cork"> and
880 B<"wool"> previously did match B<"cork"> and B<"wool">. Now it does
881 match B<"or"> and B<"wo">. As little as possible, or, in other words,
882 with as much approximateness, as many edits, as possible. Because
883 there is no I<need> to match the B<"c"> of B<"cork">, it is not matched.
884
885 =item no more C<aregex()> because regular expressions are no more used
886
887 =item no more C<compat1> for String::Approx version 1 compatibility
888
889 =back
890
891 =head1 ACKNOWLEDGEMENTS
892
893 The following people have provided valuable test cases, documentation
894 clarifications, and other feedback:
895
896 Jared August, Arthur Bergman, Anirvan Chatterjee, Steve A. Chervitz,
897 Aldo Calpini, David Curiel, Teun van den Dool, Alberto Fontaneda,
898 Rob Fugina, Dmitrij Frishman, Lars Gregersen, Kevin Greiner,
899 B. Elijah Griffin, Mike Hanafey, Mitch Helle, Ricky Houghton,
900 'idallen', Helmut Jarausch, Damian Keefe, Ben Kennedy, Craig Kelley,
901 Franz Kirsch, Dag Kristian, Mark Land, J. D. Laub, John P. Linderman,
902 Tim Maher, Juha Muilu, Sergey Novoselov, Andy Oram, Ji Y Park,
903 Eric Promislow, Nikolaus Rath, Stefan Ram, Slaven Rezic,
904 Dag Kristian Rognlien, Stewart Russell, Slaven Rezic, Chris Rosin,
905 Pasha Sadri, Ilya Sandler, Bob J.A. Schijvenaars, Ross Smith,
906 Frank Tobin, Greg Ward, Rich Williams, Rick Wise.
907
908 The matching algorithm was developed by Udi Manber, Sun Wu, and Burra
909 Gopal in the Department of Computer Science, University of Arizona.
910
911 =head1 AUTHOR
912
913 Jarkko Hietaniemi <jhi@iki.fi>
914
915 =head1 COPYRIGHT AND LICENSE
916
917 Copyright 2001-2013 by Jarkko Hietaniemi
918
919 This library is free software; you can redistribute it and/or modify
920 under either the terms of the Artistic License 2.0, or the GNU Library
921 General Public License, Version 2. See the files Artistic and LGPL
922 for more details.
923
924 Furthermore: no warranties or obligations of any kind are given, and
925 the separate file F<COPYRIGHT> must be included intact in all copies
926 and derived materials.
927
928 =cut