1 |
dpavlin |
390 |
package WebPAC::jsFind; |
2 |
|
|
|
3 |
|
|
use warnings; |
4 |
|
|
use strict; |
5 |
|
|
|
6 |
|
|
use Carp; |
7 |
dpavlin |
540 |
use jsFind 0.06; |
8 |
dpavlin |
390 |
use Log::Log4perl qw(get_logger :levels); |
9 |
|
|
|
10 |
|
|
=head1 NAME |
11 |
|
|
|
12 |
|
|
WebPAC::jsFind - create jsFind index instead of swish-e |
13 |
|
|
|
14 |
|
|
=head1 DESCRIPTION |
15 |
|
|
|
16 |
|
|
This module will create jsFind index, which is static B-Tree index |
17 |
|
|
searchable by JavaScript. It's very useful if you want to build |
18 |
|
|
CD-ROM with static content and search engine. |
19 |
|
|
|
20 |
|
|
=head1 METHODS |
21 |
|
|
|
22 |
|
|
=head2 new |
23 |
|
|
|
24 |
|
|
Create new index object |
25 |
|
|
|
26 |
|
|
my $index = new WebPAC::jsFind( |
27 |
|
|
index_path => '/path/to/jsFind/index', |
28 |
|
|
keys => 10, |
29 |
dpavlin |
409 |
log => 'log4perl.conf', |
30 |
dpavlin |
390 |
); |
31 |
|
|
|
32 |
dpavlin |
706 |
C<index_path> is path to location where jsFind index should be created. |
33 |
dpavlin |
390 |
|
34 |
|
|
C<keys> is optional parametar which specify number of keys in each node |
35 |
|
|
(which has to be even number). Default is 10. |
36 |
|
|
|
37 |
dpavlin |
409 |
C<log> is optional parametar which specify filename of L<Log::Log4Perl> |
38 |
|
|
config file. Default is C<log.conf>. |
39 |
|
|
|
40 |
dpavlin |
390 |
=cut |
41 |
|
|
|
42 |
|
|
sub new { |
43 |
|
|
my $class = shift; |
44 |
|
|
my $self = {@_}; |
45 |
|
|
bless($self, $class); |
46 |
|
|
|
47 |
|
|
confess "need index_path argument!" unless ($self->{'index_path'}); |
48 |
|
|
|
49 |
|
|
my $log_file = $self->{'log'} || "log.conf"; |
50 |
|
|
Log::Log4perl->init($log_file); |
51 |
|
|
|
52 |
|
|
return $self; |
53 |
|
|
} |
54 |
|
|
|
55 |
|
|
=head2 tree |
56 |
|
|
|
57 |
|
|
Create or retreive jsFind tree object |
58 |
|
|
|
59 |
|
|
$index->tree('index_name'); |
60 |
|
|
|
61 |
|
|
=cut |
62 |
|
|
|
63 |
|
|
sub tree { |
64 |
|
|
my $self = shift; |
65 |
|
|
|
66 |
|
|
my $index_name = shift || confess "need index name!"; |
67 |
|
|
|
68 |
|
|
if (! $self->{'tree'}->{$index_name}) { |
69 |
|
|
$self->{'tree'}->{$index_name} = new jsFind(B => $self->{keys} || 10); |
70 |
|
|
my $log = $self->_get_logger(); |
71 |
|
|
$log->debug("tree object $index_name created"); |
72 |
|
|
|
73 |
|
|
} |
74 |
|
|
|
75 |
|
|
return $self->{'tree'}->{$index_name}; |
76 |
|
|
|
77 |
|
|
} |
78 |
|
|
|
79 |
|
|
=head2 insert |
80 |
|
|
|
81 |
|
|
Insert data into index |
82 |
|
|
|
83 |
|
|
$index->insert( |
84 |
|
|
index_name => 'index_name', |
85 |
|
|
path => 'path', |
86 |
|
|
headline => 'headline text', |
87 |
|
|
words => 'words to insert into index' |
88 |
|
|
); |
89 |
|
|
|
90 |
|
|
=cut |
91 |
|
|
|
92 |
|
|
sub insert { |
93 |
|
|
my $self = shift; |
94 |
|
|
|
95 |
|
|
my $args = {@_}; |
96 |
|
|
|
97 |
|
|
my $log = $self->_get_logger(); |
98 |
|
|
|
99 |
|
|
confess "need index name" unless ($args->{'index_name'}); |
100 |
|
|
confess "need path" unless ($args->{'path'}); |
101 |
|
|
if (! $args->{'headline'}) { |
102 |
|
|
carp "no headline for ",$args->{'path'}," ?"; |
103 |
|
|
$args->{'headline'} = "no headline: ".$args->{'path'}; |
104 |
|
|
} |
105 |
dpavlin |
588 |
if (! defined($args->{'words'})) { |
106 |
|
|
$log->warn("no words to insert for headline ",$args->{'headline'}); |
107 |
|
|
return; |
108 |
|
|
} |
109 |
dpavlin |
390 |
|
110 |
dpavlin |
491 |
my $words = lc($args->{'words'}); |
111 |
dpavlin |
390 |
|
112 |
|
|
# chop leading and trailing spaces |
113 |
|
|
$words =~ s/^\s+//; |
114 |
|
|
$words =~ s/\s+$//; |
115 |
|
|
|
116 |
dpavlin |
491 |
my @words = split(/\s+/,$words); |
117 |
|
|
|
118 |
dpavlin |
390 |
my %usage; |
119 |
dpavlin |
491 |
foreach (@words) { |
120 |
dpavlin |
390 |
$usage{$_}++; |
121 |
|
|
} |
122 |
|
|
|
123 |
|
|
$log->debug("inserting '$words'", |
124 |
|
|
" into index ",$args->{'index_name'}, |
125 |
|
|
" headline: ",$args->{'headline'}, |
126 |
|
|
" path: ",$args->{'path'} |
127 |
|
|
); |
128 |
|
|
|
129 |
dpavlin |
491 |
foreach my $word (@words) { |
130 |
dpavlin |
390 |
|
131 |
|
|
$self->tree($args->{'index_name'})->B_search( |
132 |
|
|
Key => $word, |
133 |
|
|
Data => { $args->{'path'} => { |
134 |
|
|
t => $args->{'headline'}, |
135 |
|
|
f => $usage{$word}, |
136 |
|
|
}, |
137 |
|
|
}, |
138 |
|
|
Insert => 1, |
139 |
|
|
Append => 1, |
140 |
|
|
); |
141 |
|
|
} |
142 |
|
|
} |
143 |
|
|
|
144 |
|
|
=head2 close |
145 |
|
|
|
146 |
|
|
This method will dump indexes to disk. |
147 |
|
|
|
148 |
|
|
$index->close; |
149 |
|
|
|
150 |
|
|
This method will create directories if needed and store tree xml files |
151 |
|
|
for all indexes. |
152 |
|
|
|
153 |
dpavlin |
445 |
Turning debugging for this function by inserting |
154 |
dpavlin |
390 |
|
155 |
dpavlin |
445 |
log4perl.logger.WebPAC.jsFind.close=DEBUG |
156 |
|
|
|
157 |
|
|
into C<log.conf> will also result in creation of GraphViz C<.dot> files |
158 |
|
|
for each index in current directory. |
159 |
|
|
|
160 |
dpavlin |
390 |
=cut |
161 |
|
|
|
162 |
|
|
sub close { |
163 |
|
|
my $self = shift; |
164 |
|
|
|
165 |
|
|
my $log = $self->_get_logger(); |
166 |
|
|
|
167 |
|
|
foreach my $index_name (keys %{$self->{'tree'}}) { |
168 |
|
|
my $path = $self->{'index_path'}."/".$index_name; |
169 |
|
|
|
170 |
|
|
$log->debug("saving index '$index_name' xml files to '$path'"); |
171 |
|
|
|
172 |
dpavlin |
540 |
$self->tree($index_name)->to_jsfind( |
173 |
|
|
dir => $path, |
174 |
|
|
data_codepage => 'ISO-8859-2', |
175 |
|
|
index_codepage => 'UTF-8' |
176 |
|
|
); |
177 |
dpavlin |
445 |
|
178 |
|
|
if ($log->is_debug()) { |
179 |
|
|
my $dot_file = $index_name.".dot"; |
180 |
|
|
|
181 |
|
|
$log->debug("saving graphviz file for '$index_name' to '$dot_file'"); |
182 |
|
|
|
183 |
|
|
open(DOT, ">", $dot_file) || $log->logdie("can't open '$dot_file': $!"); |
184 |
|
|
print DOT $self->tree($index_name)->to_dot; |
185 |
|
|
close(DOT); |
186 |
|
|
} |
187 |
dpavlin |
390 |
} |
188 |
|
|
|
189 |
|
|
} |
190 |
|
|
|
191 |
|
|
# |
192 |
|
|
|
193 |
|
|
=head1 INTERNAL METHODS |
194 |
|
|
|
195 |
|
|
You shouldn't call this methods directly. |
196 |
|
|
|
197 |
|
|
=head2 _get_logger |
198 |
|
|
|
199 |
|
|
Get C<Log::Log4perl> object with a twist: domains are defined for each |
200 |
|
|
method |
201 |
|
|
|
202 |
|
|
my $log = $webpac->_get_logger(); |
203 |
|
|
|
204 |
|
|
=cut |
205 |
|
|
|
206 |
|
|
sub _get_logger { |
207 |
|
|
my $self = shift; |
208 |
|
|
|
209 |
|
|
my $name = (caller(1))[3] || caller; |
210 |
|
|
return get_logger($name); |
211 |
|
|
} |
212 |
|
|
1; |