--- trunk/all2xml.pl 2003/07/14 18:50:39 109 +++ trunk/all2xml.pl 2003/11/24 01:19:15 177 @@ -9,6 +9,9 @@ use Text::Iconv; use Config::IniFiles; use Encode; +#use GDBM_File; +use Fcntl; # for O_RDWR +use TDB_File; $|=1; @@ -61,6 +64,14 @@ 'feed' => 'feed' ); +my $cache; # for cacheing + +# lookup hash (tied to file) +my %lhash; +# this option will cache all lookup entries in memory. +# if you are tight on memory, turn this off +my $use_lhash_cache = 1; + sub data2xml { use xmlify; @@ -90,12 +101,24 @@ return $va <=> $vb; } - foreach my $field (sort by_order keys %{$config->{indexer}}) { + my @sorted_tags; + if ($cache->{tags_by_order}->{$type}) { + @sorted_tags = @{$cache->{tags_by_order}->{$type}}; + } else { + @sorted_tags = sort by_order keys %{$config->{indexer}}; + $cache->{tags_by_order}->{$type} = \@sorted_tags; + } + + # lookup key + my $lookup_key; + + foreach my $field (@sorted_tags) { $field=x($field); $field_usage{$field}++; my $swish_data = ""; + my $swish_exact_data = ""; my $display_data = ""; my $line_delimiter; @@ -109,10 +132,15 @@ my $repeat_off = 0; # repeatable offset - my ($s,$d,$i) = (1,1,0); # swish, display default + # swish, swish_exact, display, index, index_lookup + # swish and display defaults + my ($s,$se,$d,$i,$il) = (1,0,1,0,0); $s = 0 if (lc($x->{type}) eq "display"); $d = 0 if (lc($x->{type}) eq "swish"); + $se = 1 if (lc($x->{type}) eq "swish_exact"); ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index"); + $il = 1 if (lc($x->{type}) =~ /^lookup/); + # what will separate last line from this one? if ($display_data && $x->{append} && $x->{append} eq "1") { @@ -126,7 +154,32 @@ # placeholder for all repeatable entries for index my @index_data; - my $index_filter; + + sub mkformat { + my $x = shift || die "mkformat needs tag reference"; + my $data = shift || return; + my $format_name = x($x->{format_name}) || return $data; + my $fmt = x($config->{format}->{$format_name}->{content}) || die " is not defined!"; + my $format_delimiter = x($x->{format_delimiter}); + my @data; + if ($format_delimiter) { + @data = split(/$format_delimiter/,$data); + } else { + push @data,$data; + } + + if ($fmt) { + my $nr = scalar $fmt =~ s/%s/%s/g; + if (($#data+1) == $nr) { + return sprintf($fmt,@data); + } else { + print STDERR "mkformat: [$data] can't be split on [$format_delimiter] to $nr fields!\n"; + return $data; + } + } else { + print STDERR "usage of link '$format_name' without defined format ( tag)\n"; + } + } # while because of repeatable fields while ($swish || $display) { @@ -135,20 +188,47 @@ print STDERR "loop (more than 1000 repeatable fields) deteced in $row, $format\n"; last; } - + + # is this field is lookup? + if ($display && $x->{lookup}) { + if ($use_lhash_cache) { + if (!defined($cache->{lhash}->{$display})) { + my $new_display = $lhash{$display}; + if ($new_display) { +#print STDERR "lookup cache store '$display' = '$new_display'\n"; + $display = $new_display; + $cache->{lhash}->{$display} = $new_display; + } else { + print STDERR "WARNING: lookup for '$display' didn't find anything.\n"; + $display = ""; + $cache->{lhash}->{$display} = ""; + } + } else { + $display = $cache->{lhash}->{$display}; + } + } else { + $display = $lhash{$display} || ""; + } + } + # filter="name" ; filter this field through # filter/[name].pm my $filter = $x->{filter}; - if ($filter) { + if ($filter && !$cache->{filter_loaded}->{$filter}) { require "filter/".$filter.".pm"; + $cache->{filter_loaded}->{$filter}++; } # type="swish" ; field for swish - if ($s && $swish) { - if ($filter) { + if ($swish) { + if ($filter && ($s || $se)) { no strict 'refs'; - $swish_data .= join(" ",&$filter($swish)); + my $tmp = join(" ",&$filter($swish)) if ($s || $se); + $swish_data .= $tmp if ($s); + $swish_exact_data .= $tmp if ($se); + } else { - $swish_data .= $swish; + $swish_data .= $swish if ($s); + $swish_exact_data .= $swish if ($se); } } @@ -160,36 +240,55 @@ } if ($filter) { no strict 'refs'; - $display_data .= join($delimiter,&$filter($display)); + if ($display_data) { + $display_data .= $delimiter.join($delimiter,mkformat($x,&$filter($display))); + } else { + $display_data = join($delimiter,mkformat($x,&$filter($display))); + } } else { if ($display_data) { - $display_data .= $delimiter.$display; + $display_data .= $delimiter.mkformat($x,$display); } else { - $display_data .= $display; + $display_data = mkformat($x,$display); } } } # type="index" ; insert into index if ($i && $display) { - push @index_data, $display; - $index_filter = $filter if ($filter); + if ($filter) { + no strict 'refs'; + $display = &$filter($display); + } + if ($x->{append} && @index_data) { + $index_data[$#index_data].=$display; + } else { + push @index_data, $display; + } } - } - # fill data in index - if (@index_data) { - if ($index_filter) { - no strict 'refs'; - foreach my $d (&$index_filter(@index_data)) { - $index->insert($field, $d, $path); - } - } else { - foreach my $d (@index_data) { - $index->insert($field, $d, $path); + # store fields in lookup + if ($il && $display) { + if (lc($x->{type}) eq "lookup_key") { + if ($lookup_key) { + print STDERR "WARNING: try to redefine lookup_key (keys shouldn't be repeatable fields!)"; + } else { + $lookup_key = $display; + } + } elsif (lc($x->{type}) eq "lookup_val") { + if ($lookup_key) { + $lhash{$lookup_key} = $display; + } else { + print STDERR "WARNING: no lookup_key defined for '$display'?"; + } } } } + + # fill data in index + foreach my $d (@index_data) { + $index->insert($field, $d, $path); + } } # now try to parse variables from configuration file @@ -201,6 +300,8 @@ my ($s,$d,$i) = (1,1,0); # swish, display default $s = 0 if (lc($x->{type}) eq "display"); $d = 0 if (lc($x->{type}) eq "swish"); + # no support for swish exact in config. + # IMHO, it's useless ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index"); if ($val) { @@ -243,6 +344,15 @@ $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); } + if ($swish_exact_data) { + $swish_exact_data =~ s/ +/ /g; + $swish_exact_data =~ s/ +$//g; + + # add delimiters before and after word. + # That is required to produce exact match + $xml .= xmlify($field."_swish_exact", unac_string($codepage,'xxbxx '.$swish_exact_data.' xxexx')); + } + } @@ -278,18 +388,39 @@ my $show_progress = $cfg_global->val('global', 'show_progress'); +my $unac_filter = $cfg_global->val('global', 'unac_filter'); +if ($unac_filter) { + require $unac_filter; +} + foreach my $database ($cfg->Sections) { my $type = lc($cfg -> val($database, 'type')) || die "$database doesn't have 'type' defined"; my $add_xml = $cfg -> val($database, 'xml'); # optional + # create new lookup file + my $lookup_file = $cfg -> val($database, 'lookup_newfile'); # optional + if ($lookup_file) { + #tie %lhash, 'GDBM_File', $lookup_file, &GDBM_NEWDB, 0644; + tie %lhash, 'TDB_File', $lookup_file, TDB_CLEAR_IF_FIRST, O_RDWR, 0644; + print STDERR "creating lookup file '$lookup_file'\n"; + } + + # open existing lookup file + $lookup_file = $cfg -> val($database, 'lookup_open'); # optional + if ($lookup_file) { + #tie %lhash, 'GDBM_File', $lookup_file, &GDBM_READER, 0644; + tie %lhash, 'TDB_File', $lookup_file, TDB_DEFAULT, O_RDWR, 0644; + print STDERR "opening lookup file '$lookup_file'\n"; + } + print STDERR "reading ./import_xml/$type.xml\n"; # extract just type basic my $type_base = $type; $type_base =~ s/_.+$//g; - $config=XMLin("./import_xml/$type.xml", forcearray => [ $type2tag{$type_base}, 'config' ], forcecontent => 1); + $config=XMLin("./import_xml/$type.xml", forcearray => [ $type2tag{$type_base}, 'config', 'format' ], forcecontent => 1); # output current progress indicator my $last_p = 0; @@ -533,6 +664,8 @@ fakeprogress($i); } + # close lookup + untie %lhash if (%lhash); } }