--- trunk/all2xml.pl 2003/11/24 01:19:15 177 +++ trunk/all2xml.pl 2003/12/15 00:12:16 196 @@ -72,6 +72,8 @@ # if you are tight on memory, turn this off my $use_lhash_cache = 1; +my $last_field_name; # cache to prevent repeated fields + sub data2xml { use xmlify; @@ -102,16 +104,54 @@ } my @sorted_tags; - if ($cache->{tags_by_order}->{$type}) { - @sorted_tags = @{$cache->{tags_by_order}->{$type}}; + if ($cache->{tags_by_order}) { + @sorted_tags = @{$cache->{tags_by_order}}; } else { @sorted_tags = sort by_order keys %{$config->{indexer}}; - $cache->{tags_by_order}->{$type} = \@sorted_tags; + $cache->{tags_by_order} = \@sorted_tags; } # lookup key my $lookup_key; + # cache for field in pages + delete $cache->{display_data}; + delete $cache->{swish_data}; + delete $cache->{swish_exact_data}; + delete $cache->{index_data}; + delete $cache->{index_delimiter}; + my @page_fields; # names of fields + + + # subs used to produce output + + sub get_field_name($$$) { + my ($config,$field,$field_usage) = @_; + + # find field name (signular, plural) + my $field_name = ""; + if ($config->{indexer}->{$field}->{name_singular} && $field_usage == 1) { + $field_name = $config->{indexer}->{$field}->{name_singular}; + } elsif ($config->{indexer}->{$field}->{name_plural}) { + $field_name = $config->{indexer}->{$field}->{name_plural}; + } elsif ($config->{indexer}->{$field}->{name}) { + $field_name = $config->{indexer}->{$field}->{name}; + } else { + print STDERR "WARNING: field '$field' doesn't have 'name' attribute!"; + } + if ($field_name) { + if (! $last_field_name) { + $last_field_name = x($field_name); + return $last_field_name; + } elsif ($field_name ne $last_field_name) { + $last_field_name = x($field_name); + return $last_field_name; + } + } + } + + + # begin real work: go field by field foreach my $field (@sorted_tags) { $field=x($field); @@ -120,30 +160,46 @@ my $swish_data = ""; my $swish_exact_data = ""; my $display_data = ""; + my @index_data; my $line_delimiter; my ($swish,$display); my $tag = $type2tag{$type} || die "can't find which tag to use for type $type"; + + # is this field page-by-page? + my $iterate_by_page = $config->{indexer}->{$field}->{iterate_by_page}; + push @page_fields,$field if ($iterate_by_page); + my %page_max = (); + # default line_delimiter if using + my $page_line_delimiter = $config->{indexer}->{$field}->{page_line_delimiter} || '
'; + $cache->{index_delimiter}->{$field} = $config->{indexer}->{$field}->{index_delimiter}; + + my $format_name = $config->{indexer}->{$field}->{format_name}; + my $format_delimiter = $config->{indexer}->{$field}->{format_delimiter}; + if ($format_name && $format_delimiter) { + $cache->{format}->{$field}->{format_name} = $format_name; + $cache->{format}->{$field}->{format_delimiter} = $format_delimiter; + } + foreach my $x (@{$config->{indexer}->{$field}->{$tag}}) { my $format = x($x->{content}); my $delimiter = x($x->{delimiter}) || ' '; - my $repeat_off = 0; # repeatable offset + my $repeat_off = 0; # init repeatable offset # swish, swish_exact, display, index, index_lookup # swish and display defaults my ($s,$se,$d,$i,$il) = (1,0,1,0,0); $s = 0 if (lc($x->{type}) eq "display"); $d = 0 if (lc($x->{type}) eq "swish"); - $se = 1 if (lc($x->{type}) eq "swish_exact"); - ($s,$d,$i) = (0,0,1) if (lc($x->{type}) eq "index"); - $il = 1 if (lc($x->{type}) =~ /^lookup/); - + ($s,$se,$d,$i) = (0,1,0,1) if (lc($x->{type}) eq "index"); + ($s,$se,$d,$i) = (0,1,0,0) if (lc($x->{type}) eq "swish_exact"); + ($s,$se,$d,$i,$il) = (0,1,0,0,1) if (lc($x->{type}) =~ /^lookup/); # what will separate last line from this one? - if ($display_data && $x->{append} && $x->{append} eq "1") { + if ($display_data && $x->{append}) { $line_delimiter = ' '; } elsif ($display_data) { $line_delimiter = '
'; @@ -153,9 +209,8 @@ ($swish,$display) = (1,1); # placeholder for all repeatable entries for index - my @index_data; - sub mkformat { + sub mkformat($$) { my $x = shift || die "mkformat needs tag reference"; my $data = shift || return; my $format_name = x($x->{format_name}) || return $data; @@ -183,6 +238,8 @@ # while because of repeatable fields while ($swish || $display) { + my $page = $repeat_off; + $page_max{$field} = $page if ($iterate_by_page && $page > ($page_max{$field} || 0)); ($swish,$display) = parse_format($type, $format,$row,$repeat_off++,$import2cp); if ($repeat_off > 1000) { print STDERR "loop (more than 1000 repeatable fields) deteced in $row, $format\n"; @@ -191,23 +248,24 @@ # is this field is lookup? if ($display && $x->{lookup}) { + my $null = ""; if ($use_lhash_cache) { if (!defined($cache->{lhash}->{$display})) { my $new_display = $lhash{$display}; - if ($new_display) { + if (defined($new_display)) { #print STDERR "lookup cache store '$display' = '$new_display'\n"; $display = $new_display; $cache->{lhash}->{$display} = $new_display; } else { print STDERR "WARNING: lookup for '$display' didn't find anything.\n"; $display = ""; - $cache->{lhash}->{$display} = ""; + $cache->{lhash}->{$display} = $null; } } else { $display = $cache->{lhash}->{$display}; } } else { - $display = $lhash{$display} || ""; + $display = $lhash{$display} || $null; } } @@ -224,11 +282,23 @@ no strict 'refs'; my $tmp = join(" ",&$filter($swish)) if ($s || $se); $swish_data .= $tmp if ($s); - $swish_exact_data .= $tmp if ($se); + if ($se) { + if ($swish_exact_data) { + $swish_exact_data .= "xxexx xxbxx ".$tmp; + } else { + $swish_exact_data .= $tmp; + } + } } else { $swish_data .= $swish if ($s); - $swish_exact_data .= $swish if ($se); + if ($se) { + if ($swish_exact_data) { + $swish_exact_data .= "xxexx xxbxx ".$swish; + } else { + $swish_exact_data .= $swish; + } + } } } @@ -236,7 +306,6 @@ if ($d && $display) { if ($line_delimiter && $display_data) { $display_data .= $line_delimiter; - undef $line_delimiter; } if ($filter) { no strict 'refs'; @@ -255,16 +324,14 @@ } # type="index" ; insert into index + my $idisplay; if ($i && $display) { + $idisplay = $display; if ($filter) { no strict 'refs'; - $display = &$filter($display); - } - if ($x->{append} && @index_data) { - $index_data[$#index_data].=$display; - } else { - push @index_data, $display; + $idisplay = &$filter($idisplay); } + push @index_data, $idisplay if (! $iterate_by_page); } # store fields in lookup @@ -282,12 +349,71 @@ print STDERR "WARNING: no lookup_key defined for '$display'?"; } } + + } + + # store data for page-by-page repeatable fields + if ($iterate_by_page) { + sub iterate_fld($$$$$$) { + my ($cache,$what,$field,$page,$data,$append) = @_; + return if (!$data); + + my $ldel = $page_line_delimiter; + $ldel = " " if ($append); +#print STDERR "line delimiter: ",Dumper($ldel) if ($ldel); + if (! $cache->{$what}->{$field}->[$page]) { + $cache->{$what}->{$field}->[$page] = $data; + } else { + $cache->{$what}->{$field}->[$page] .= $ldel.$data; + } + } + + if ($display_data) { + iterate_fld($cache,'display_data',$field,$page,$display_data,$x->{append}); + } + $display_data = ""; + if ($swish_data) { + iterate_fld($cache,'swish_data',$field,$page,$swish_data,$x->{append}); + $swish_data = ""; + } + if ($swish_exact_data) { + iterate_fld($cache,'swish_exact_data',$field,$page,$swish_exact_data,$x->{append}); + $swish_exact_data = ""; + } + + if ($idisplay) { + my $ldel=$page_line_delimiter; + my @index_data; + if ($cache->{index_data}->{$field}->[$page]) { + + @index_data = @{$cache->{index_data}->{$field}->[$page]}; + } + if ($x->{append}) { + if (@index_data) { + $index_data[$#index_data] .= $idisplay; + } else { + push @index_data, $idisplay; + } + } else { + push @index_data, $idisplay; + } + $idisplay = ""; + @{$cache->{index_data}->{$field}->[$page]} = @index_data; + } } } - # fill data in index - foreach my $d (@index_data) { - $index->insert($field, $d, $path); + if (! $iterate_by_page) { + my $idel = $x->{index_delimiter}; + # fill data in index + foreach my $tmp (@index_data) { + my $i = $d = $tmp; + if ($idel && $tmp =~ m/$idel/) { + ($i,$d) = split(/$idel/,$tmp); + } + $index->insert($field, $i, $d, $path); + } + @index_data = (); } } @@ -310,50 +436,109 @@ $index->insert($field, $val, $path) if ($i); } + if ($iterate_by_page) { + # FIXME data from config tag will appear just + # on first page!!! + my $page = 0; + if ($display_data) { + $cache->{display_data}->{$field}->[$page] = $display_data; + $display_data = ""; + } + if ($swish_data) { + $cache->{swish_data}->{$field}->[$page] = $swish_data; + $swish_data = ""; + } + if ($swish_exact_data) { + $cache->{swish_exact_data}->{$field}->[$page] = $swish_exact_data; + $swish_exact_data = ""; + } + } } - - if ($display_data) { - - if ($field eq "headline") { - $xml .= xmlify("headline", $display_data); - } else { - - # find field name (signular, plural) - my $field_name = ""; - if ($config->{indexer}->{$field}->{name_singular} && $field_usage{$field} == 1) { - $field_name = $config->{indexer}->{$field}->{name_singular}."#-#"; - } elsif ($config->{indexer}->{$field}->{name_plural}) { - $field_name = $config->{indexer}->{$field}->{name_plural}."#-#"; - } elsif ($config->{indexer}->{$field}->{name}) { - $field_name = $config->{indexer}->{$field}->{name}."#-#"; + # save data page-by-page + foreach my $field (@page_fields) { + my $nr_pages = $page_max{$field} || next; +#print STDERR "field '$field' iterate over ",($nr_pages || 0)," pages...\n"; +#print STDERR Dumper($cache->{display_data}); + for (my $page=0; $page <= $nr_pages; $page++) { + my $display_data; + if ($cache->{format}->{$field}) { + $display_data=mkformat($cache->{format}->{$field},$cache->{display_data}->{$field}->[$page]); } else { - print STDERR "WARNING: field '$field' doesn't have 'name' attribute!"; + $display_data = $cache->{display_data}->{$field}->[$page]; + } + if ($display_data) { # default + if ($field eq "headline") { + $xml .= xmlify("headline", $display_data); + } else { + + # fallback to empty field name if needed + $html .= get_field_name($config,$field,$field_usage{$field}) || ''; + $html .= "#-#".$display_data."###\n"; + } } - if ($field_name) { - $html .= x($field_name); + + my $swish_data = $cache->{swish_data}->{$field}->[$page]; + if ($swish_data) { + # remove extra spaces + $swish_data =~ s/ +/ /g; + $swish_data =~ s/ +$//g; + + $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); + } + + my $swish_exact_data = $cache->{swish_exact_data}->{$field}->[$page]; + if ($swish_exact_data) { + $swish_exact_data =~ s/ +/ /g; + $swish_exact_data =~ s/ +$//g; + + # add delimiters before and after word. + # That is required to produce exact match + $xml .= xmlify($field."_swish_exact", unac_string($codepage,'xxbxx '.$swish_exact_data.' xxexx')); + } + + my $idel = $cache->{index_delimiter}->{$field}; + foreach my $tmp (@{$cache->{index_data}->{$field}->[$page]}) { + my $i = $tmp; + my $d = $tmp; + if ($idel && $tmp =~ m/$idel/) { + ($i,$d) = split(/$idel/,$tmp); + } + $index->insert($field, $i, $d, $path); +#print STDERR "index [$idel] $field: $i --> $d [$path]\n"; } - $html .= $display_data."###\n"; } + } - if ($swish_data) { - # remove extra spaces - $swish_data =~ s/ +/ /g; - $swish_data =~ s/ +$//g; - - $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); - } + + if (! $iterate_by_page) { + if ($display_data) { + if ($field eq "headline") { + $xml .= xmlify("headline", $display_data); + } else { - if ($swish_exact_data) { - $swish_exact_data =~ s/ +/ /g; - $swish_exact_data =~ s/ +$//g; - - # add delimiters before and after word. - # That is required to produce exact match - $xml .= xmlify($field."_swish_exact", unac_string($codepage,'xxbxx '.$swish_exact_data.' xxexx')); - } + # fallback to empty field name if needed + $html .= get_field_name($config,$field,$field_usage{$field}) || ''; + $html .= "#-#".$display_data."###\n"; + } + } + if ($swish_data) { + # remove extra spaces + $swish_data =~ s/ +/ /g; + $swish_data =~ s/ +$//g; + $xml .= xmlify($field."_swish", unac_string($codepage,$swish_data)); + } + if ($swish_exact_data) { + $swish_exact_data =~ s/ +/ /g; + $swish_exact_data =~ s/ +$//g; + + # add delimiters before and after word. + # That is required to produce exact match + $xml .= xmlify($field."_swish_exact", unac_string($codepage,'xxbxx '.$swish_exact_data.' xxexx')); + } + } } # dump formatted output in @@ -452,6 +637,9 @@ # now read database print STDERR "using: $type...\n"; + # erase cache for tags by order in this database + delete $cache->{tags_by_order}; + if ($type_base eq "isis") { my $isis_db = $cfg -> val($database, 'isis_db') || die "$database doesn't have 'isis_db' defined!";