| 1 |
1 |
dpavlin |
#!/usr/bin/perl -w |
| 2 |
|
|
|
| 3 |
|
|
use strict; |
| 4 |
760 |
dpavlin |
use Biblio::Isis 0.23; |
| 5 |
1 |
dpavlin |
use Getopt::Std; |
| 6 |
|
|
use Data::Dumper; |
| 7 |
|
|
use XML::Simple; |
| 8 |
10 |
dpavlin |
use Text::Iconv; |
| 9 |
13 |
dpavlin |
use Config::IniFiles; |
| 10 |
40 |
dpavlin |
use Encode; |
| 11 |
776 |
dpavlin |
use GDBM_File; |
| 12 |
177 |
dpavlin |
use Fcntl; # for O_RDWR |
| 13 |
776 |
dpavlin |
#use TDB_File; |
| 14 |
684 |
dpavlin |
use Carp; |
| 15 |
1 |
dpavlin |
|
| 16 |
10 |
dpavlin |
$|=1; |
| 17 |
9 |
dpavlin |
|
| 18 |
13 |
dpavlin |
my $config_file = $0; |
| 19 |
|
|
$config_file =~ s/\.pl$/.conf/; |
| 20 |
320 |
dpavlin |
$config_file = $ARGV[0] if ($ARGV[0] && -f $ARGV[0]); |
| 21 |
13 |
dpavlin |
die "FATAL: can't find configuration file '$config_file'" if (! -e $config_file); |
| 22 |
|
|
|
| 23 |
10 |
dpavlin |
my $config; |
| 24 |
|
|
|
| 25 |
58 |
dpavlin |
#use index_DBI; # default DBI module for index |
| 26 |
652 |
dpavlin |
#use index_DBI_cache; # faster DBI module using memory cache |
| 27 |
|
|
use index_DBI_filter; # filter support for indexes |
| 28 |
50 |
dpavlin |
my $index; |
| 29 |
10 |
dpavlin |
|
| 30 |
1 |
dpavlin |
my %opts; |
| 31 |
|
|
|
| 32 |
7 |
dpavlin |
# usage: |
| 33 |
|
|
# -d directory name |
| 34 |
|
|
# -m multiple directories |
| 35 |
|
|
# -q quiet |
| 36 |
|
|
# -s run swish |
| 37 |
1 |
dpavlin |
|
| 38 |
7 |
dpavlin |
getopts('d:m:qs', \%opts); |
| 39 |
|
|
|
| 40 |
43 |
dpavlin |
my $path; # this is name of database |
| 41 |
1 |
dpavlin |
|
| 42 |
57 |
dpavlin |
Text::Iconv->raise_error(0); # Conversion errors don't raise exceptions |
| 43 |
3 |
dpavlin |
|
| 44 |
40 |
dpavlin |
# this is encoding of all files on disk, including import_xml/*.xml file and |
| 45 |
|
|
# filter/*.pm files! It will be used to store strings in perl internally! |
| 46 |
|
|
my $codepage = 'ISO-8859-2'; |
| 47 |
10 |
dpavlin |
|
| 48 |
40 |
dpavlin |
my $utf2cp = Text::Iconv->new('UTF-8',$codepage); |
| 49 |
|
|
# this function will convert data from XML files to local encoding |
| 50 |
|
|
sub x { |
| 51 |
|
|
return $utf2cp->convert($_[0]); |
| 52 |
|
|
} |
| 53 |
29 |
dpavlin |
|
| 54 |
54 |
dpavlin |
# decode isis/excel or other import codepage |
| 55 |
|
|
my $import2cp; |
| 56 |
29 |
dpavlin |
|
| 57 |
40 |
dpavlin |
# outgoing xml must be in UTF-8 |
| 58 |
|
|
my $cp2utf = Text::Iconv->new($codepage,'UTF-8'); |
| 59 |
|
|
|
| 60 |
54 |
dpavlin |
# mapping between data type and tag which specify |
| 61 |
|
|
# format in XML file |
| 62 |
|
|
my %type2tag = ( |
| 63 |
|
|
'isis' => 'isis', |
| 64 |
62 |
dpavlin |
'excel' => 'column', |
| 65 |
|
|
'marc' => 'marc', |
| 66 |
684 |
dpavlin |
'feed' => 'feed', |
| 67 |
54 |
dpavlin |
); |
| 68 |
3 |
dpavlin |
|
| 69 |
170 |
dpavlin |
my $cache; # for cacheing |
| 70 |
|
|
|
| 71 |
177 |
dpavlin |
# lookup hash (tied to file) |
| 72 |
|
|
my %lhash; |
| 73 |
|
|
# this option will cache all lookup entries in memory. |
| 74 |
|
|
# if you are tight on memory, turn this off |
| 75 |
|
|
my $use_lhash_cache = 1; |
| 76 |
|
|
|
| 77 |
195 |
dpavlin |
my $last_field_name; # cache to prevent repeated fields |
| 78 |
|
|
|
| 79 |
748 |
dpavlin |
my $broken_cdata = XMLin('<foo><![CDATA[<bar>]]></foo>') eq '<bar>>'; |
| 80 |
|
|
warn "XML::Simple on this system seems broken with <![CDATA[..]]>.\n" if ($broken_cdata); |
| 81 |
|
|
|
| 82 |
54 |
dpavlin |
sub data2xml { |
| 83 |
|
|
|
| 84 |
10 |
dpavlin |
use xmlify; |
| 85 |
|
|
|
| 86 |
54 |
dpavlin |
my $type = shift @_; |
| 87 |
3 |
dpavlin |
my $row = shift @_; |
| 88 |
13 |
dpavlin |
my $add_xml = shift @_; |
| 89 |
59 |
dpavlin |
# needed to read values from configuration file |
| 90 |
|
|
my $cfg = shift @_; |
| 91 |
|
|
my $database = shift @_; |
| 92 |
3 |
dpavlin |
|
| 93 |
|
|
my $xml; |
| 94 |
|
|
|
| 95 |
10 |
dpavlin |
use parse_format; |
| 96 |
3 |
dpavlin |
|
| 97 |
13 |
dpavlin |
my $html = ""; # html formatted display output |
| 98 |
10 |
dpavlin |
|
| 99 |
13 |
dpavlin |
my %field_usage; # counter for usage of each field |
| 100 |
|
|
|
| 101 |
32 |
dpavlin |
# sort subrouting using order="" attribute |
| 102 |
|
|
sub by_order { |
| 103 |
98 |
dpavlin |
my $va = $config->{indexer}->{$a}->{order} || |
| 104 |
|
|
$config->{indexer}->{$a}; |
| 105 |
|
|
my $vb = $config->{indexer}->{$b}->{order} || |
| 106 |
|
|
$config->{indexer}->{$b}; |
| 107 |
29 |
dpavlin |
|
| 108 |
98 |
dpavlin |
return $va <=> $vb; |
| 109 |
32 |
dpavlin |
} |
| 110 |
3 |
dpavlin |
|
| 111 |
170 |
dpavlin |
my @sorted_tags; |
| 112 |
180 |
dpavlin |
if ($cache->{tags_by_order}) { |
| 113 |
|
|
@sorted_tags = @{$cache->{tags_by_order}}; |
| 114 |
170 |
dpavlin |
} else { |
| 115 |
|
|
@sorted_tags = sort by_order keys %{$config->{indexer}}; |
| 116 |
180 |
dpavlin |
$cache->{tags_by_order} = \@sorted_tags; |
| 117 |
170 |
dpavlin |
} |
| 118 |
32 |
dpavlin |
|
| 119 |
684 |
dpavlin |
if (! @sorted_tags) { |
| 120 |
|
|
print STDERR "WARNING: no tags for this type found in import_xml file!\n"; |
| 121 |
|
|
} |
| 122 |
|
|
|
| 123 |
177 |
dpavlin |
# lookup key |
| 124 |
|
|
my $lookup_key; |
| 125 |
|
|
|
| 126 |
178 |
dpavlin |
# cache for field in pages |
| 127 |
180 |
dpavlin |
delete $cache->{display_data}; |
| 128 |
|
|
delete $cache->{swish_data}; |
| 129 |
|
|
delete $cache->{swish_exact_data}; |
| 130 |
182 |
dpavlin |
delete $cache->{index_data}; |
| 131 |
188 |
dpavlin |
delete $cache->{index_delimiter}; |
| 132 |
731 |
dpavlin |
delete $cache->{distinct}; |
| 133 |
178 |
dpavlin |
my @page_fields; # names of fields |
| 134 |
|
|
|
| 135 |
|
|
|
| 136 |
|
|
# subs used to produce output |
| 137 |
|
|
|
| 138 |
|
|
sub get_field_name($$$) { |
| 139 |
|
|
my ($config,$field,$field_usage) = @_; |
| 140 |
|
|
|
| 141 |
|
|
# find field name (signular, plural) |
| 142 |
|
|
my $field_name = ""; |
| 143 |
|
|
if ($config->{indexer}->{$field}->{name_singular} && $field_usage == 1) { |
| 144 |
|
|
$field_name = $config->{indexer}->{$field}->{name_singular}; |
| 145 |
|
|
} elsif ($config->{indexer}->{$field}->{name_plural}) { |
| 146 |
|
|
$field_name = $config->{indexer}->{$field}->{name_plural}; |
| 147 |
|
|
} elsif ($config->{indexer}->{$field}->{name}) { |
| 148 |
|
|
$field_name = $config->{indexer}->{$field}->{name}; |
| 149 |
|
|
} else { |
| 150 |
|
|
print STDERR "WARNING: field '$field' doesn't have 'name' attribute!"; |
| 151 |
|
|
} |
| 152 |
601 |
dpavlin |
|
| 153 |
178 |
dpavlin |
if ($field_name) { |
| 154 |
601 |
dpavlin |
$field_name = x($field_name); |
| 155 |
195 |
dpavlin |
if (! $last_field_name) { |
| 156 |
601 |
dpavlin |
$last_field_name = $field_name; |
| 157 |
195 |
dpavlin |
return $last_field_name; |
| 158 |
|
|
} elsif ($field_name ne $last_field_name) { |
| 159 |
601 |
dpavlin |
$last_field_name = $field_name; |
| 160 |
195 |
dpavlin |
return $last_field_name; |
| 161 |
|
|
} |
| 162 |
178 |
dpavlin |
} |
| 163 |
|
|
} |
| 164 |
|
|
|
| 165 |
279 |
dpavlin |
|
| 166 |
|
|
# init variables for different types |
| 167 |
274 |
dpavlin |
sub init_visible_type($) { |
| 168 |
|
|
my $type = shift; |
| 169 |
178 |
dpavlin |
|
| 170 |
274 |
dpavlin |
# swish, swish_exact, display, index, index_lookup |
| 171 |
|
|
# swish and display defaults |
| 172 |
|
|
my ($s,$se,$d,$i,$il) = (1,0,1,0,0); |
| 173 |
|
|
if (lc($type) eq "display") { |
| 174 |
|
|
$s = 0; |
| 175 |
|
|
} elsif (lc($type) eq "swish") { |
| 176 |
|
|
$d = 0; |
| 177 |
|
|
} elsif (lc($type) eq "index") { |
| 178 |
|
|
($s,$se,$d,$i) = (0,1,0,1); |
| 179 |
|
|
} elsif (lc($type) eq "swish_exact") { |
| 180 |
|
|
($s,$se,$d,$i) = (0,1,0,0); |
| 181 |
|
|
} elsif (lc($type) =~ /^lookup/) { |
| 182 |
|
|
($s,$se,$d,$i,$il) = (0,1,0,0,1); |
| 183 |
332 |
dpavlin |
} elsif ($type) { |
| 184 |
|
|
print STDERR "WARNING: unknown type: $type\n"; |
| 185 |
274 |
dpavlin |
} |
| 186 |
|
|
return ($s,$se,$d,$i,$il); |
| 187 |
|
|
} |
| 188 |
|
|
|
| 189 |
279 |
dpavlin |
|
| 190 |
|
|
# convert |
| 191 |
|
|
# |
| 192 |
|
|
# <tag> |
| 193 |
|
|
# <delimiter>, </delimiter> |
| 194 |
|
|
# <value>200a</value> |
| 195 |
|
|
# </tag> |
| 196 |
|
|
# |
| 197 |
|
|
# to |
| 198 |
|
|
# |
| 199 |
|
|
# <tag delimiter=", ">200a</tag> |
| 200 |
|
|
# |
| 201 |
|
|
# but without loosing spaces in delimiter (becasue |
| 202 |
|
|
# new XML::Simple strips spaces in attribute values |
| 203 |
|
|
# as defined in XML specification) |
| 204 |
|
|
# |
| 205 |
|
|
sub unroll_x($) { |
| 206 |
|
|
my $x = shift; |
| 207 |
|
|
|
| 208 |
|
|
if (defined $x->{value}) { |
| 209 |
|
|
my ($v,$d) = ($x->{value}->{content}, $x->{delimiter}->{content}); |
| 210 |
|
|
delete $x->{value}; |
| 211 |
|
|
delete $x->{delimiter}; |
| 212 |
|
|
$x->{content} = $v; |
| 213 |
748 |
dpavlin |
$d =~ s#>$## if ($d && $broken_cdata); |
| 214 |
279 |
dpavlin |
$x->{delimiter} = $d; |
| 215 |
|
|
} |
| 216 |
|
|
return $x; |
| 217 |
|
|
} |
| 218 |
|
|
|
| 219 |
178 |
dpavlin |
# begin real work: go field by field |
| 220 |
170 |
dpavlin |
foreach my $field (@sorted_tags) { |
| 221 |
|
|
|
| 222 |
40 |
dpavlin |
$field=x($field); |
| 223 |
13 |
dpavlin |
$field_usage{$field}++; |
| 224 |
|
|
|
| 225 |
9 |
dpavlin |
my $swish_data = ""; |
| 226 |
163 |
dpavlin |
my $swish_exact_data = ""; |
| 227 |
10 |
dpavlin |
my $display_data = ""; |
| 228 |
182 |
dpavlin |
my @index_data; |
| 229 |
35 |
dpavlin |
my $line_delimiter; |
| 230 |
3 |
dpavlin |
|
| 231 |
34 |
dpavlin |
my ($swish,$display); |
| 232 |
|
|
|
| 233 |
684 |
dpavlin |
my $tag = $cfg->val($database, 'import_xml_tag') || $type2tag{$type} || die "can't find which tag to use for type $type"; |
| 234 |
178 |
dpavlin |
|
| 235 |
|
|
# is this field page-by-page? |
| 236 |
|
|
my $iterate_by_page = $config->{indexer}->{$field}->{iterate_by_page}; |
| 237 |
|
|
push @page_fields,$field if ($iterate_by_page); |
| 238 |
|
|
my %page_max = (); |
| 239 |
|
|
# default line_delimiter if using |
| 240 |
|
|
my $page_line_delimiter = $config->{indexer}->{$field}->{page_line_delimiter} || '<br/>'; |
| 241 |
188 |
dpavlin |
$cache->{index_delimiter}->{$field} = $config->{indexer}->{$field}->{index_delimiter}; |
| 242 |
731 |
dpavlin |
my $distinct = $config->{indexer}->{$field}->{distinct}; |
| 243 |
|
|
if ($distinct && !$iterate_by_page) { |
| 244 |
|
|
warn "WARNING: distinct is currently not supported without iterate_by_page!\n"; |
| 245 |
|
|
$distinct = 0; |
| 246 |
|
|
} |
| 247 |
178 |
dpavlin |
|
| 248 |
195 |
dpavlin |
my $format_name = $config->{indexer}->{$field}->{format_name}; |
| 249 |
|
|
my $format_delimiter = $config->{indexer}->{$field}->{format_delimiter}; |
| 250 |
|
|
if ($format_name && $format_delimiter) { |
| 251 |
|
|
$cache->{format}->{$field}->{format_name} = $format_name; |
| 252 |
|
|
$cache->{format}->{$field}->{format_delimiter} = $format_delimiter; |
| 253 |
|
|
} |
| 254 |
|
|
|
| 255 |
54 |
dpavlin |
foreach my $x (@{$config->{indexer}->{$field}->{$tag}}) { |
| 256 |
3 |
dpavlin |
|
| 257 |
279 |
dpavlin |
$x = unroll_x($x); |
| 258 |
|
|
|
| 259 |
40 |
dpavlin |
my $format = x($x->{content}); |
| 260 |
|
|
my $delimiter = x($x->{delimiter}) || ' '; |
| 261 |
3 |
dpavlin |
|
| 262 |
178 |
dpavlin |
my $repeat_off = 0; # init repeatable offset |
| 263 |
29 |
dpavlin |
|
| 264 |
274 |
dpavlin |
my ($s,$se,$d,$i,$il) = init_visible_type($x->{type}); |
| 265 |
29 |
dpavlin |
|
| 266 |
35 |
dpavlin |
# what will separate last line from this one? |
| 267 |
182 |
dpavlin |
if ($display_data && $x->{append}) { |
| 268 |
290 |
dpavlin |
$line_delimiter = $delimiter; |
| 269 |
35 |
dpavlin |
} elsif ($display_data) { |
| 270 |
34 |
dpavlin |
$line_delimiter = '<br/>'; |
| 271 |
|
|
} |
| 272 |
29 |
dpavlin |
|
| 273 |
34 |
dpavlin |
# init vars so that we go into while... |
| 274 |
|
|
($swish,$display) = (1,1); |
| 275 |
29 |
dpavlin |
|
| 276 |
195 |
dpavlin |
sub mkformat($$) { |
| 277 |
153 |
dpavlin |
my $x = shift || die "mkformat needs tag reference"; |
| 278 |
|
|
my $data = shift || return; |
| 279 |
263 |
dpavlin |
my $format_name = x($x->{format_name}) || return $data; |
| 280 |
153 |
dpavlin |
my $fmt = x($config->{format}->{$format_name}->{content}) || die "<format name=\"$format_name\"> is not defined!"; |
| 281 |
|
|
my $format_delimiter = x($x->{format_delimiter}); |
| 282 |
|
|
my @data; |
| 283 |
|
|
if ($format_delimiter) { |
| 284 |
|
|
@data = split(/$format_delimiter/,$data); |
| 285 |
|
|
} else { |
| 286 |
|
|
push @data,$data; |
| 287 |
|
|
} |
| 288 |
|
|
|
| 289 |
|
|
if ($fmt) { |
| 290 |
|
|
my $nr = scalar $fmt =~ s/%s/%s/g; |
| 291 |
|
|
if (($#data+1) == $nr) { |
| 292 |
263 |
dpavlin |
return sprintf($fmt,@data); |
| 293 |
153 |
dpavlin |
} else { |
| 294 |
207 |
dpavlin |
#print STDERR "mkformat: [$data] can't be split on [$format_delimiter] to $nr fields!\n"; |
| 295 |
263 |
dpavlin |
return $data; |
| 296 |
153 |
dpavlin |
} |
| 297 |
|
|
} else { |
| 298 |
|
|
print STDERR "usage of link '$format_name' without defined format (<link> tag)\n"; |
| 299 |
|
|
} |
| 300 |
|
|
} |
| 301 |
|
|
|
| 302 |
90 |
dpavlin |
# while because of repeatable fields |
| 303 |
|
|
while ($swish || $display) { |
| 304 |
178 |
dpavlin |
my $page = $repeat_off; |
| 305 |
|
|
$page_max{$field} = $page if ($iterate_by_page && $page > ($page_max{$field} || 0)); |
| 306 |
54 |
dpavlin |
($swish,$display) = parse_format($type, $format,$row,$repeat_off++,$import2cp); |
| 307 |
90 |
dpavlin |
if ($repeat_off > 1000) { |
| 308 |
|
|
print STDERR "loop (more than 1000 repeatable fields) deteced in $row, $format\n"; |
| 309 |
|
|
last; |
| 310 |
|
|
} |
| 311 |
177 |
dpavlin |
|
| 312 |
|
|
# is this field is lookup? |
| 313 |
|
|
if ($display && $x->{lookup}) { |
| 314 |
178 |
dpavlin |
my $null = "<!-- null -->"; |
| 315 |
177 |
dpavlin |
if ($use_lhash_cache) { |
| 316 |
|
|
if (!defined($cache->{lhash}->{$display})) { |
| 317 |
|
|
my $new_display = $lhash{$display}; |
| 318 |
178 |
dpavlin |
if (defined($new_display)) { |
| 319 |
177 |
dpavlin |
#print STDERR "lookup cache store '$display' = '$new_display'\n"; |
| 320 |
|
|
$display = $new_display; |
| 321 |
|
|
$cache->{lhash}->{$display} = $new_display; |
| 322 |
|
|
} else { |
| 323 |
255 |
dpavlin |
# print STDERR "WARNING: lookup for '$display' didn't find anything.\n"; |
| 324 |
177 |
dpavlin |
$display = ""; |
| 325 |
178 |
dpavlin |
$cache->{lhash}->{$display} = $null; |
| 326 |
177 |
dpavlin |
} |
| 327 |
|
|
} else { |
| 328 |
|
|
$display = $cache->{lhash}->{$display}; |
| 329 |
|
|
} |
| 330 |
|
|
} else { |
| 331 |
178 |
dpavlin |
$display = $lhash{$display} || $null; |
| 332 |
177 |
dpavlin |
} |
| 333 |
|
|
} |
| 334 |
|
|
|
| 335 |
29 |
dpavlin |
# filter="name" ; filter this field through |
| 336 |
|
|
# filter/[name].pm |
| 337 |
|
|
my $filter = $x->{filter}; |
| 338 |
170 |
dpavlin |
if ($filter && !$cache->{filter_loaded}->{$filter}) { |
| 339 |
29 |
dpavlin |
require "filter/".$filter.".pm"; |
| 340 |
170 |
dpavlin |
$cache->{filter_loaded}->{$filter}++; |
| 341 |
20 |
dpavlin |
} |
| 342 |
29 |
dpavlin |
# type="swish" ; field for swish |
| 343 |
163 |
dpavlin |
if ($swish) { |
| 344 |
259 |
dpavlin |
my $tmp = $swish; |
| 345 |
163 |
dpavlin |
if ($filter && ($s || $se)) { |
| 346 |
29 |
dpavlin |
no strict 'refs'; |
| 347 |
259 |
dpavlin |
$tmp = join(" ",&$filter($tmp)) if ($s || $se); |
| 348 |
29 |
dpavlin |
} |
| 349 |
234 |
dpavlin |
|
| 350 |
255 |
dpavlin |
$swish_data .= $tmp if ($s && $tmp); |
| 351 |
256 |
dpavlin |
$swish_exact_data .= "xxbxx $tmp xxexx " if ($tmp && $tmp ne "" && $se); |
| 352 |
29 |
dpavlin |
} |
| 353 |
17 |
dpavlin |
|
| 354 |
29 |
dpavlin |
# type="display" ; field for display |
| 355 |
|
|
if ($d && $display) { |
| 356 |
207 |
dpavlin |
my $ldel = $delimiter; |
| 357 |
35 |
dpavlin |
if ($line_delimiter && $display_data) { |
| 358 |
207 |
dpavlin |
$ldel = $line_delimiter; |
| 359 |
35 |
dpavlin |
} |
| 360 |
29 |
dpavlin |
if ($filter) { |
| 361 |
|
|
no strict 'refs'; |
| 362 |
207 |
dpavlin |
my @arr; |
| 363 |
|
|
foreach my $tmp (&$filter($display)) { |
| 364 |
|
|
my $tmp2 = mkformat($x,$tmp); |
| 365 |
|
|
push @arr,$tmp2 if ($tmp2); |
| 366 |
138 |
dpavlin |
} |
| 367 |
207 |
dpavlin |
$display_data .= $ldel if ($display_data && @arr); |
| 368 |
|
|
$display_data .= join($delimiter,@arr); |
| 369 |
29 |
dpavlin |
} else { |
| 370 |
207 |
dpavlin |
$display_data .= $ldel if ($display_data); |
| 371 |
|
|
my $tmp = mkformat($x,$display); |
| 372 |
224 |
dpavlin |
$display_data .= $tmp if ($tmp); |
| 373 |
29 |
dpavlin |
} |
| 374 |
20 |
dpavlin |
} |
| 375 |
29 |
dpavlin |
|
| 376 |
|
|
# type="index" ; insert into index |
| 377 |
182 |
dpavlin |
my $idisplay; |
| 378 |
29 |
dpavlin |
if ($i && $display) { |
| 379 |
182 |
dpavlin |
$idisplay = $display; |
| 380 |
177 |
dpavlin |
if ($filter) { |
| 381 |
|
|
no strict 'refs'; |
| 382 |
182 |
dpavlin |
$idisplay = &$filter($idisplay); |
| 383 |
177 |
dpavlin |
} |
| 384 |
255 |
dpavlin |
push @index_data, $idisplay if ($idisplay && !$iterate_by_page); |
| 385 |
97 |
dpavlin |
} |
| 386 |
|
|
|
| 387 |
177 |
dpavlin |
# store fields in lookup |
| 388 |
|
|
if ($il && $display) { |
| 389 |
|
|
if (lc($x->{type}) eq "lookup_key") { |
| 390 |
|
|
if ($lookup_key) { |
| 391 |
|
|
print STDERR "WARNING: try to redefine lookup_key (keys shouldn't be repeatable fields!)"; |
| 392 |
|
|
} else { |
| 393 |
233 |
dpavlin |
if ($filter) { |
| 394 |
|
|
no strict 'refs'; |
| 395 |
|
|
$lookup_key = &$filter($display); |
| 396 |
|
|
} else { |
| 397 |
|
|
$lookup_key = $display; |
| 398 |
|
|
} |
| 399 |
177 |
dpavlin |
} |
| 400 |
|
|
} elsif (lc($x->{type}) eq "lookup_val") { |
| 401 |
|
|
if ($lookup_key) { |
| 402 |
233 |
dpavlin |
if ($filter) { |
| 403 |
|
|
no strict 'refs'; |
| 404 |
|
|
$lhash{$lookup_key} = &$filter($display); |
| 405 |
|
|
} else { |
| 406 |
|
|
$lhash{$lookup_key} = $display; |
| 407 |
|
|
} |
| 408 |
177 |
dpavlin |
} else { |
| 409 |
|
|
print STDERR "WARNING: no lookup_key defined for '$display'?"; |
| 410 |
|
|
} |
| 411 |
20 |
dpavlin |
} |
| 412 |
178 |
dpavlin |
|
| 413 |
20 |
dpavlin |
} |
| 414 |
178 |
dpavlin |
|
| 415 |
|
|
# store data for page-by-page repeatable fields |
| 416 |
|
|
if ($iterate_by_page) { |
| 417 |
|
|
sub iterate_fld($$$$$$) { |
| 418 |
|
|
my ($cache,$what,$field,$page,$data,$append) = @_; |
| 419 |
|
|
return if (!$data); |
| 420 |
182 |
dpavlin |
|
| 421 |
|
|
my $ldel = $page_line_delimiter; |
| 422 |
|
|
$ldel = " " if ($append); |
| 423 |
|
|
#print STDERR "line delimiter: ",Dumper($ldel) if ($ldel); |
| 424 |
178 |
dpavlin |
if (! $cache->{$what}->{$field}->[$page]) { |
| 425 |
731 |
dpavlin |
push @{$cache->{$what}->{$field}->[$page]}, { |
| 426 |
|
|
data => $data, |
| 427 |
|
|
delimiter => $ldel, |
| 428 |
|
|
}; |
| 429 |
178 |
dpavlin |
} |
| 430 |
|
|
} |
| 431 |
|
|
|
| 432 |
|
|
if ($display_data) { |
| 433 |
|
|
iterate_fld($cache,'display_data',$field,$page,$display_data,$x->{append}); |
| 434 |
|
|
} |
| 435 |
|
|
$display_data = ""; |
| 436 |
|
|
if ($swish_data) { |
| 437 |
|
|
iterate_fld($cache,'swish_data',$field,$page,$swish_data,$x->{append}); |
| 438 |
|
|
$swish_data = ""; |
| 439 |
|
|
} |
| 440 |
|
|
if ($swish_exact_data) { |
| 441 |
|
|
iterate_fld($cache,'swish_exact_data',$field,$page,$swish_exact_data,$x->{append}); |
| 442 |
|
|
$swish_exact_data = ""; |
| 443 |
|
|
} |
| 444 |
182 |
dpavlin |
|
| 445 |
|
|
if ($idisplay) { |
| 446 |
|
|
my $ldel=$page_line_delimiter; |
| 447 |
|
|
my @index_data; |
| 448 |
|
|
if ($cache->{index_data}->{$field}->[$page]) { |
| 449 |
|
|
|
| 450 |
|
|
@index_data = @{$cache->{index_data}->{$field}->[$page]}; |
| 451 |
|
|
} |
| 452 |
|
|
if ($x->{append}) { |
| 453 |
195 |
dpavlin |
if (@index_data) { |
| 454 |
|
|
$index_data[$#index_data] .= $idisplay; |
| 455 |
|
|
} else { |
| 456 |
|
|
push @index_data, $idisplay; |
| 457 |
|
|
} |
| 458 |
182 |
dpavlin |
} else { |
| 459 |
|
|
push @index_data, $idisplay; |
| 460 |
|
|
} |
| 461 |
|
|
$idisplay = ""; |
| 462 |
|
|
@{$cache->{index_data}->{$field}->[$page]} = @index_data; |
| 463 |
|
|
} |
| 464 |
178 |
dpavlin |
} |
| 465 |
17 |
dpavlin |
} |
| 466 |
177 |
dpavlin |
|
| 467 |
182 |
dpavlin |
if (! $iterate_by_page) { |
| 468 |
188 |
dpavlin |
my $idel = $x->{index_delimiter}; |
| 469 |
182 |
dpavlin |
# fill data in index |
| 470 |
188 |
dpavlin |
foreach my $tmp (@index_data) { |
| 471 |
|
|
my $i = $d = $tmp; |
| 472 |
|
|
if ($idel && $tmp =~ m/$idel/) { |
| 473 |
|
|
($i,$d) = split(/$idel/,$tmp); |
| 474 |
|
|
} |
| 475 |
|
|
$index->insert($field, $i, $d, $path); |
| 476 |
182 |
dpavlin |
} |
| 477 |
|
|
@index_data = (); |
| 478 |
177 |
dpavlin |
} |
| 479 |
3 |
dpavlin |
} |
| 480 |
9 |
dpavlin |
|
| 481 |
59 |
dpavlin |
# now try to parse variables from configuration file |
| 482 |
|
|
foreach my $x (@{$config->{indexer}->{$field}->{'config'}}) { |
| 483 |
13 |
dpavlin |
|
| 484 |
279 |
dpavlin |
$x = unroll_x($x); |
| 485 |
|
|
|
| 486 |
62 |
dpavlin |
my $delimiter = x($x->{delimiter}) || ' '; |
| 487 |
59 |
dpavlin |
my $val = $cfg->val($database, x($x->{content})); |
| 488 |
|
|
|
| 489 |
274 |
dpavlin |
# FIXME index_lookup is not supported! |
| 490 |
|
|
my ($s,$se,$d,$i,$il) = init_visible_type($x->{type}); |
| 491 |
59 |
dpavlin |
|
| 492 |
|
|
if ($val) { |
| 493 |
62 |
dpavlin |
$display_data .= $delimiter.$val if ($d); |
| 494 |
274 |
dpavlin |
$swish_data .= " ".$val if ($s); |
| 495 |
748 |
dpavlin |
$index->insert($field, $val, $val, $path) if ($i); |
| 496 |
59 |
dpavlin |
} |
| 497 |
|
|
|
| 498 |
178 |
dpavlin |
if ($iterate_by_page) { |
| 499 |
|
|
# FIXME data from config tag will appear just |
| 500 |
|
|
# on first page!!! |
| 501 |
|
|
my $page = 0; |
| 502 |
|
|
if ($display_data) { |
| 503 |
731 |
dpavlin |
push @{$cache->{display_data}->{$field}->[$page]}, { data => $display_data }; |
| 504 |
178 |
dpavlin |
$display_data = ""; |
| 505 |
|
|
} |
| 506 |
|
|
if ($swish_data) { |
| 507 |
731 |
dpavlin |
push @{$cache->{swish_data}->{$field}->[$page]}, { data => $swish_data }; |
| 508 |
178 |
dpavlin |
$swish_data = ""; |
| 509 |
|
|
} |
| 510 |
|
|
if ($swish_exact_data) { |
| 511 |
731 |
dpavlin |
push @{$cache->{swish_exact_data}->{$field}->[$page]}, { data => $swish_exact_data }; |
| 512 |
178 |
dpavlin |
$swish_exact_data = ""; |
| 513 |
|
|
} |
| 514 |
|
|
} |
| 515 |
59 |
dpavlin |
} |
| 516 |
|
|
|
| 517 |
178 |
dpavlin |
# save data page-by-page |
| 518 |
|
|
foreach my $field (@page_fields) { |
| 519 |
|
|
my $nr_pages = $page_max{$field} || next; |
| 520 |
|
|
#print STDERR "field '$field' iterate over ",($nr_pages || 0)," pages...\n"; |
| 521 |
|
|
#print STDERR Dumper($cache->{display_data}); |
| 522 |
731 |
dpavlin |
my $seen; # used for distinct |
| 523 |
178 |
dpavlin |
for (my $page=0; $page <= $nr_pages; $page++) { |
| 524 |
195 |
dpavlin |
my $display_data; |
| 525 |
731 |
dpavlin |
my $delimiter = ''; |
| 526 |
|
|
foreach my $element (@{ $cache->{display_data}->{$field}->[$page] }) { |
| 527 |
|
|
my $data = $element->{data}; |
| 528 |
|
|
die "BUG! no data in element?" unless ($data); |
| 529 |
|
|
|
| 530 |
|
|
if ($distinct) { |
| 531 |
|
|
|