| Revision 36 (by dpavlin, 2004/10/10 08:30:36) |
removed locale fix and locale altogether -- JavaScript has no notion of
locale, so I must sort without locale
|
#!/usr/bin/perl -w
#
use strict;
# Entry limit checked by the grouping loop; 0 switches the limit off.
my $max = 0;

# Number of leading headline characters used as the lookup key.  Taken
# from the first command-line argument when one is given, otherwise 3.
my $min_len = defined($ARGV[0]) ? shift(@ARGV) : 3;

# Group size above which the script suggests raising $min_len.
my $increase_at = 500;

# Name of the JavaScript variable that receives the generated index.
my $headlines = 'headlines';

# Non-zero enables per-group diagnostics on STDERR.
my $debug = 1;
# Copy the contents of a file, line by line, to the currently selected
# output handle.
#
#   $f - path of the file to copy
#
# Returns without doing anything when no path (undef or an empty string)
# is supplied.  Dies with "$f: $!" when the file cannot be opened.
sub print_file {
    my $f = shift;
    return unless (defined($f) && length($f));

    # Three-argument open with a lexical handle: the path is always taken
    # literally (never as a mode or pipe), surrounding whitespace is kept,
    # and a file that happens to be named "0" is still readable.
    open(my $fh, '<', $f) or die "$f: $!";

    # A lexical line variable keeps the caller's $_ intact.
    while (defined(my $line = <$fh>)) {
        print $line;
    }

    return close($fh);
}
# Start the generated JavaScript: a blank line, then the declaration of
# the (initially empty) index object.
print "\n", "var $headlines = new Object();\n";

# State shared with the grouping loop further down.
my $max_elements = 0;    # largest group size measure seen so far
my $total        = 0;    # running entry tally, written to the footer
my $last_part    = '';   # prefix of the group currently being collected
my @part_arr;            # formatted entries of the current group
# Turn a scalar into a JavaScript literal.
#
#   $t - value to encode
#
# Returns:
#   - the bare token undef when $t is undefined or empty (unchanged from
#     the earlier behaviour; presumably the consuming JavaScript defines
#     that name -- TODO confirm),
#   - the value itself when it is a canonical non-negative decimal integer
#     ("0", "42"),
#   - otherwise a single-quoted string with quotes, backslashes and line
#     breaks escaped.
#
# Writes an "ESCAPED" notice to STDERR whenever characters were escaped.
sub escape_js {
    my $t = shift;
    return 'undef' unless (defined($t) && length($t));

    # Only canonical integers may go out unquoted.  "0" is a real value,
    # not a missing one; "007" must stay a string because JavaScript would
    # read a bare 007 as legacy octal (or drop the leading zeros).
    return $t if ($t =~ m/\A(?:0|[1-9][0-9]*)\z/);

    my $changed = 0;
    # escape single quote and backslash (must run before the line-break
    # escapes below, which introduce backslashes of their own)
    $changed = 1 if ($t =~ s/(['\\])/\\$1/g);
    # a raw line terminator inside a JavaScript string literal is a
    # syntax error, so encode CR and LF as well
    $changed = 1 if ($t =~ s/\n/\\n/g);
    $changed = 1 if ($t =~ s/\r/\\r/g);
    print STDERR "ESCAPED '$t'\n" if ($changed);

    return "'$t'";
}
# Collect the usable records from STDIN into @lines.
#
# A usable record has a headline, a tab, and at least one character after
# the last tab.  Everything else is reported on STDERR and dropped.
my @lines;
while (defined(my $raw = <STDIN>)) {
    chomp($raw);

    my $entry = $raw;
    # tolerate CRLF input: a stray CR would otherwise end up inside the
    # last column
    $entry =~ s/\r\z//;
    # remove leading spaces (which are ignored if source list was
    # sorted using locale).  This happens BEFORE the tab check so that a
    # line whose only tab is leading whitespace is not mistaken for a
    # record with a data column.
    $entry =~ s/^\s+//;

    if ($entry !~ m/\t/ || $entry =~ m/\t\z/) {
        print STDERR "SKIP '$raw': no tab\n";
        next;
    }

    push @lines, $entry;
}
# Emit the entries collected for the current prefix as one JavaScript
# array assignment and fold their number into the statistics.  Does
# nothing when no entries are pending.
my $flush_part = sub {
    return unless (@part_arr);

    # real element count ($#part_arr would be one too low)
    my $count = scalar(@part_arr);

    print STDERR $last_part, "\t", $count, "\n" if ($debug && $count > $increase_at);
    $max_elements = $count if ($count > $max_elements);

    print "${headlines}[", escape_js($last_part), "] = [\n ", join(",\n ", @part_arr), "];\n";

    $total += $count;
    @part_arr = ();
    return;
};

# Walk the records in case-insensitive order (no locale: the JavaScript
# side has none either) and group them by their lowercased prefix of
# $min_len characters.
foreach my $line (sort { lc($a) cmp lc($b) } @lines) {
    my ($headline, @data) = split(/\t+/, $line);

    # test for presence, not truth, so that a headline of "0" is accepted
    die "need at least headline!" unless (defined($headline) && length($headline));

    if (length($headline) < $min_len) {
        print STDERR "SKIP '$line': too short\n";
        next;
    }

    # lookup key: first $min_len characters, lowercased
    my $part = lc(substr($headline, 0, $min_len));

    # new part?  The flush is a no-op while nothing is pending, so no
    # "is $last_part set yet" test is needed (such a test misfires for
    # the prefix "0").
    $flush_part->() if ($part ne $last_part);
    $last_part = $part;

    # joining headline and data in one list avoids a dangling comma when
    # a record carries no data columns
    push @part_arr, "[" . join(",", map { escape_js($_) } $headline, @data) . "]";

    # break out?  ($total only counts groups that were already written)
    last if ($max && $total > $max);
}

# the last group is written -- and counted -- like every other one
$flush_part->();

print qq{
${headlines}.min_len = $min_len;
${headlines}.length = $total;
};
print STDERR "You have more than $increase_at elements, so you should\nincrease min_len to ",$min_len+1," or higher for performance benefit.\n" if ($max_elements > $increase_at);