1 |
package oreilly_de_catalog::wait_filter; |
2 |
use strict; |
3 |
use charnames ":full"; |
4 |
|
5 |
|
6 |
# we have taken all kinds of precautions to make sure, the strings |
7 |
# going through these filters will be upgraded. But we still get |
8 |
# decomposed characters from somewhere. We must have much more |
9 |
# paranoia in the next version |
10 |
|
11 |
sub WAIT::Filter::OR_lc_20020125 { |
12 |
|
13 |
my $s = shift; |
14 |
return unless defined $s; |
15 |
return unless length $s; |
16 |
my $lc = lc $s; |
17 |
if ($lc =~ /[^\040-\177]/) { |
18 |
$lc =~ s/\N{LATIN SMALL LETTER AE}/ae/gs; |
19 |
$lc =~ s/\N{LATIN SMALL LETTER A WITH DIAERESIS}/ae/gs; |
20 |
$lc =~ s/\N{LATIN SMALL LETTER O WITH DIAERESIS}/oe/gs; |
21 |
$lc =~ s/\N{LATIN SMALL LETTER U WITH DIAERESIS}/ue/gs; |
22 |
$lc =~ s/\N{LATIN SMALL LETTER SHARP S}/ss/gs; |
23 |
$lc =~ s/\N{SUPERSCRIPT ONE}//gs; |
24 |
$lc =~ s/\N{NO-BREAK SPACE}/ /gs; |
25 |
$lc =~ s/\N{EN DASH}/-/gs; |
26 |
$lc =~ s/\N{EM DASH}/-/gs; |
27 |
$lc =~ s/\N{SOFT HYPHEN}//g; |
28 |
$lc =~ s/\N{VULGAR FRACTION ONE QUARTER}//g; |
29 |
} else { |
30 |
return $lc; |
31 |
} |
32 |
if ($lc =~ /[^\040-\177]/) { |
33 |
my $nlc = ""; |
34 |
for my $c (split //, $lc){ |
35 |
my $ord = ord $c; |
36 |
if ($ord < 128){ |
37 |
$nlc .= $c; |
38 |
next; |
39 |
} |
40 |
my $cname = charnames::viacode($ord); |
41 |
unless ($cname) { |
42 |
# illegal |
43 |
next; |
44 |
} |
45 |
my $isletter = $c =~ /\p{Word}/; |
46 |
unless ($isletter) { |
47 |
$nlc .= $c; |
48 |
next; |
49 |
} |
50 |
my($repl) = $cname =~ /^LATIN SMALL LETTER (\S+)/; |
51 |
unless (defined $repl && length $repl){ |
52 |
warn "no repl after cname[$cname]" ; |
53 |
$repl = ""; |
54 |
if (length $s > 60) { |
55 |
warn sprintf "substr(s)[%s]", substr($s,0,60); |
56 |
} else { |
57 |
warn "s[$s]"; |
58 |
} |
59 |
} |
60 |
$repl = lc $repl; |
61 |
$nlc .= $repl; |
62 |
} |
63 |
$lc = $nlc; |
64 |
} |
65 |
warn " OR_lc: $lc\n" if oreilly_de_catalog::config::ULTRA_VERBOSE(); |
66 |
$lc; |
67 |
} |
68 |
|
69 |
sub WAIT::Filter::OR_tr_20020124 { |
70 |
my $s = shift; |
71 |
$s =~ s/\N{SOFT HYPHEN}//g; |
72 |
$s =~ s/\N{SUPERSCRIPT ONE}//gs; |
73 |
$s =~ s/\N{NO-BREAK SPACE}/ /gs; |
74 |
$s =~ s/[^\p{Word}\-]+/ /g; |
75 |
warn " OR_tr: $s\n" if oreilly_de_catalog::config::ULTRA_VERBOSE(); |
76 |
$s; |
77 |
} |
78 |
|
79 |
sub WAIT::Filter::OR_minus_20020311 { |
80 |
my $s = shift; |
81 |
return if $s eq "-"; # protect against sterm "chopblanks-attribut148" |
82 |
# showing about everything with 99% |
83 |
my @s; |
84 |
if ($s =~ /-/){ # web-anwendungen webanwendungen |
85 |
my $ssans = $s; |
86 |
$ssans =~ s/-//g; |
87 |
if ($s =~ /^-/ || $s =~ /-\z/) { |
88 |
@s = $ssans; |
89 |
} else { |
90 |
@s = ($s, $ssans); |
91 |
} |
92 |
} else { |
93 |
@s = $s; # let ordinary words through or we're just out of business |
94 |
} |
95 |
warn "OR_minus: @s\n" if oreilly_de_catalog::config::ULTRA_VERBOSE(); |
96 |
return @s; |
97 |
} |
98 |
|
99 |
sub WAIT::Filter::OR_mixedonly_20020221 { |
100 |
my $s = shift; |
101 |
return if $s =~ /^[\-.]$/; # protect against sterm |
102 |
# "chopblanks-attribut148" showing about |
103 |
# everything with 99% |
104 |
return unless $s =~ /\p{Word}/ && $s =~ /\P{Word}/ || length($s)==1 && $s =~ /\P{Word}/; |
105 |
warn "OR_mixed: $s\n" if oreilly_de_catalog::config::ULTRA_VERBOSE(); |
106 |
$s; |
107 |
} |
108 |
|
109 |
sub WAIT::Filter::OR_isbn_20020127 { |
110 |
my $s = shift; |
111 |
# we would like to be strict when indexing and lax when reading a |
112 |
# query but we are only a filtering function and know nothing about |
113 |
# the outside world, so we do not know if we are called by a search |
114 |
# or during indexing. So we cannot do this: |
115 |
if (0) { # ideal for indexing, too intolerant when querying |
116 |
return unless $s =~ /^\d+-\d+-\d+-[\dx]/i; |
117 |
return unless length($s)==13; |
118 |
} |
119 |
return unless $s =~ /^[\d\-x]+$/i; |
120 |
my $s2 = $s = uc $s; |
121 |
my @s; |
122 |
if ($s2 =~ s/-//g) { |
123 |
@s = ($s,$s2); |
124 |
} else { |
125 |
@s = $s; |
126 |
} |
127 |
warn " OR_isbn: @s\n" if oreilly_de_catalog::config::ULTRA_VERBOSE(); |
128 |
@s; |
129 |
} |
130 |
|
131 |
sub WAIT::Filter::OR_trigrams_20020125 { |
132 |
my $string = shift; |
133 |
my @result; |
134 |
my $start; |
135 |
|
136 |
my $end = length($string) - 2; |
137 |
for ($start=0; $start<$end; $start++) { |
138 |
my $s = substr $string, $start, 3; |
139 |
push @result, $s; |
140 |
} |
141 |
@result; |
142 |
} |
143 |
|
144 |
sub WAIT::Filter::OR_split_20020401 { |
145 |
# just cloning split from WAIT::Filter. Need debug more carefully |
146 |
my(@args) = @_; |
147 |
# warn sprintf("argsN[%s]argsV[%s]", scalar @args, join(":",@args)); |
148 |
# use Devel::Peek; |
149 |
# Devel::Peek::Dump($_) for @args; |
150 |
map CORE::split(" ", $_), @args; |
151 |
} |
152 |
|
153 |
1; |