--- trunk/all2xml.pl 2003/11/24 01:19:15 177 +++ trunk/all2xml.pl 2003/11/25 20:19:03 181 @@ -102,16 +102,46 @@ } my @sorted_tags; - if ($cache->{tags_by_order}->{$type}) { - @sorted_tags = @{$cache->{tags_by_order}->{$type}}; + if ($cache->{tags_by_order}) { + @sorted_tags = @{$cache->{tags_by_order}}; } else { @sorted_tags = sort by_order keys %{$config->{indexer}}; - $cache->{tags_by_order}->{$type} = \@sorted_tags; + $cache->{tags_by_order} = \@sorted_tags; } # lookup key my $lookup_key; + # cache for field in pages + delete $cache->{display_data}; + delete $cache->{swish_data}; + delete $cache->{swish_exact_data}; + my @page_fields; # names of fields + + + # subs used to produce output + + sub get_field_name($$$) { + my ($config,$field,$field_usage) = @_; + + # find field name (signular, plural) + my $field_name = ""; + if ($config->{indexer}->{$field}->{name_singular} && $field_usage == 1) { + $field_name = $config->{indexer}->{$field}->{name_singular}; + } elsif ($config->{indexer}->{$field}->{name_plural}) { + $field_name = $config->{indexer}->{$field}->{name_plural}; + } elsif ($config->{indexer}->{$field}->{name}) { + $field_name = $config->{indexer}->{$field}->{name}; + } else { + print STDERR "WARNING: field '$field' doesn't have 'name' attribute!"; + } + if ($field_name) { + return x($field_name); + } + } + + + # begin real work: go field by field foreach my $field (@sorted_tags) { $field=x($field); @@ -125,23 +155,30 @@ my ($swish,$display); my $tag = $type2tag{$type} || die "can't find which tag to use for type $type"; + + # is this field page-by-page? + my $iterate_by_page = $config->{indexer}->{$field}->{iterate_by_page}; + push @page_fields,$field if ($iterate_by_page); + my %page_max = (); + # default line_delimiter if using + my $page_line_delimiter = $config->{indexer}->{$field}->{page_line_delimiter} || '
'; + foreach my $x (@{$config->{indexer}->{$field}->{$tag}}) { my $format = x($x->{content}); my $delimiter = x($x->{delimiter}) || ' '; - my $repeat_off = 0; # repeatable offset + my $repeat_off = 0; # init repeatable offset # swish, swish_exact, display, index, index_lookup # swish and display defaults my ($s,$se,$d,$i,$il) = (1,0,1,0,0); $s = 0 if (lc($x->{type}) eq "display"); $d = 0 if (lc($x->{type}) eq "swish"); - $se = 1 if (lc($x->{type}) eq "swish_exact"); - ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index"); + ($s,$se,$d,$i) = (0,0,0,1) if (lc($x->{type}) eq "index"); + ($s,$se,$d,$i) = (0,1,0,0) if (lc($x->{type}) eq "swish_exact"); $il = 1 if (lc($x->{type}) =~ /^lookup/); - # what will separate last line from this one? if ($display_data && $x->{append} && $x->{append} eq "1") { $line_delimiter = ' '; @@ -183,6 +220,8 @@ # while because of repeatable fields while ($swish || $display) { + my $page = $repeat_off; + $page_max{$field} = $page if ($iterate_by_page && $page > ($page_max{$field} || 0)); ($swish,$display) = parse_format($type, $format,$row,$repeat_off++,$import2cp); if ($repeat_off > 1000) { print STDERR "loop (more than 1000 repeatable fields) deteced in $row, $format\n"; @@ -191,23 +230,24 @@ # is this field is lookup? if ($display && $x->{lookup}) { + my $null = ""; if ($use_lhash_cache) { if (!defined($cache->{lhash}->{$display})) { my $new_display = $lhash{$display}; - if ($new_display) { + if (defined($new_display)) { #print STDERR "lookup cache store '$display' = '$new_display'\n"; $display = $new_display; $cache->{lhash}->{$display} = $new_display; } else { print STDERR "WARNING: lookup for '$display' didn't find anything.\n"; $display = ""; - $cache->{lhash}->{$display} = ""; + $cache->{lhash}->{$display} = $null; } } else { $display = $cache->{lhash}->{$display}; } } else { - $display = $lhash{$display} || ""; + $display = $lhash{$display} || $null; } } @@ -236,7 +276,6 @@ if ($d && $display) { if ($line_delimiter && $display_data) { $display_data .= $line_delimiter; - undef $line_delimiter; } if ($filter) { no strict 'refs'; @@ -282,6 +321,36 @@ print STDERR "WARNING: no lookup_key defined for '$display'?"; } } + + } + + # store data for page-by-page repeatable fields + if ($iterate_by_page) { + sub iterate_fld($$$$$$) { + my ($cache,$what,$field,$page,$data,$append) = @_; + return if (!$data); + my $line_delimiter = $page_line_delimiter; + $line_delimiter = " " if ($append); + if (! $cache->{$what}->{$field}->[$page]) { + $cache->{$what}->{$field}->[$page] = $data; + } else { + $cache->{$what}->{$field}->[$page] .= $line_delimiter.$data; + } + } + + if ($display_data) { +print STDERR "line delimiter: ",Dumper($line_delimiter) if ($line_delimiter); + iterate_fld($cache,'display_data',$field,$page,$display_data,$x->{append}); + } + $display_data = ""; + if ($swish_data) { + iterate_fld($cache,'swish_data',$field,$page,$swish_data,$x->{append}); + $swish_data = ""; + } + if ($swish_exact_data) { + iterate_fld($cache,'swish_exact_data',$field,$page,$swish_exact_data,$x->{append}); + $swish_exact_data = ""; + } } } @@ -310,50 +379,94 @@ $index->insert($field, $val, $path) if ($i); } + if ($iterate_by_page) { + # FIXME data from config tag will appear just + # on first page!!! + my $page = 0; + if ($display_data) { + $cache->{display_data}->{$field}->[$page] = $display_data; + $display_data = ""; + } + if ($swish_data) { + $cache->{swish_data}->{$field}->[$page] = $swish_data; + $swish_data = ""; + } + if ($swish_exact_data) { + $cache->{swish_exact_data}->{$field}->[$page] = $swish_exact_data; + $swish_exact_data = ""; + } + } } + # save data page-by-page + foreach my $field (@page_fields) { + my $nr_pages = $page_max{$field} || next; +#print STDERR "field '$field' iterate over ",($nr_pages || 0)," pages...\n"; +#print STDERR Dumper($cache->{display_data}); + for (my $page=0; $page <= $nr_pages; $page++) { + + my $display_data = $cache->{display_data}->{$field}->[$page]; + if ($display_data) { # default + if ($field eq "headline") { + $xml .= xmlify("headline", $display_data); + } else { - if ($display_data) { - - if ($field eq "headline") { - $xml .= xmlify("headline", $display_data); - } else { - - # find field name (signular, plural) - my $field_name = ""; - if ($config->{indexer}->{$field}->{name_singular} && $field_usage{$field} == 1) { - $field_name = $config->{indexer}->{$field}->{name_singular}."#-#"; - } elsif ($config->{indexer}->{$field}->{name_plural}) { - $field_name = $config->{indexer}->{$field}->{name_plural}."#-#"; - } elsif ($config->{indexer}->{$field}->{name}) { - $field_name = $config->{indexer}->{$field}->{name}."#-#"; - } else { - print STDERR "WARNING: field '$field' doesn't have 'name' attribute!"; + # fallback to empty field name if needed + $html .= get_field_name($config,$field,$field_usage{$field}) || ''; + $html .= "#-#".$display_data."###\n"; + } } - if ($field_name) { - $html .= x($field_name); + + my $swish_data = $cache->{swish_data}->{$field}->[$page]; + if ($swish_data) { + # remove extra spaces + $swish_data =~ s/ +/ /g; + $swish_data =~ s/ +$//g; + + $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); + } + + my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page]; + if ($swish_exact_data) { + $swish_exact_data =~ s/ +/ /g; + $swish_exact_data =~ s/ +$//g; + + # add delimiters before and after word. + # That is required to produce exact match + $xml .= xmlify($field."_swish_exact", unac_string($codepage,'xxbxx '.$swish_exact_data.' xxexx')); } - $html .= $display_data."###\n"; } + } - if ($swish_data) { - # remove extra spaces - $swish_data =~ s/ +/ /g; - $swish_data =~ s/ +$//g; - - $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); - } + + if (! $iterate_by_page) { + if ($display_data) { + if ($field eq "headline") { + $xml .= xmlify("headline", $display_data); + } else { - if ($swish_exact_data) { - $swish_exact_data =~ s/ +/ /g; - $swish_exact_data =~ s/ +$//g; - - # add delimiters before and after word. - # That is required to produce exact match - $xml .= xmlify($field."_swish_exact", unac_string($codepage,'xxbxx '.$swish_exact_data.' xxexx')); - } + # fallback to empty field name if needed + $html .= get_field_name($config,$field,$field_usage{$field}) || ''; + $html .= "#-#".$display_data."###\n"; + } + } + if ($swish_data) { + # remove extra spaces + $swish_data =~ s/ +/ /g; + $swish_data =~ s/ +$//g; + $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); + } + if ($swish_exact_data) { + $swish_exact_data =~ s/ +/ /g; + $swish_exact_data =~ s/ +$//g; + + # add delimiters before and after word. + # That is required to produce exact match + $xml .= xmlify($field."_swish_exact", unac_string($codepage,'xxbxx '.$swish_exact_data.' xxexx')); + } + } } # dump formatted output in @@ -452,6 +565,9 @@ # now read database print STDERR "using: $type...\n"; + # erase cache for tags by order in this database + delete $cache->{tags_by_order}; + if ($type_base eq "isis") { my $isis_db = $cfg -> val($database, 'isis_db') || die "$database doesn't have 'isis_db' defined!";