/[bfilter]/trunk/bfilter.pl
This is repository of my old source code which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /trunk/bfilter.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 35 - (hide annotations)
Sat Oct 9 21:00:06 2004 UTC (19 years, 6 months ago) by dpavlin
File MIME type: text/plain
File size: 2978 byte(s)
re-sort input data in memory to overcome locale problem with spaces

1 dpavlin 1 #!/usr/bin/perl -w
2     #
3    
4     use strict;
5     use locale;
6    
7     # maximum entries
8     my $max = 0;
9     # minimum letters to search by
10 dpavlin 29 my $min_len = shift @ARGV;
11     $min_len = 3 unless defined($min_len);
12 dpavlin 4 # if more than x elements, warn to increase min_len
13     my $increase_at = 500;
14 dpavlin 1
15 dpavlin 10 # name of generated index
16     my $headlines = 'headlines';
17    
18 dpavlin 3 my $debug = 1;
19    
20 dpavlin 1 sub print_file {
21     my $f = shift || return;
22     open(F, $f) || die "$f: $!";
23     while(<F>) {
24     print;
25     }
26     close(F);
27     }
28    
29     print qq{
30 dpavlin 10 var $headlines = new Object();
31 dpavlin 1 };
32    
33     my @part_arr;
34     my $last_part = '';
35     my $total = 0;
36    
37 dpavlin 4 my $max_elements = 0;
38    
39 dpavlin 25 sub escape_js {
40 dpavlin 26 my $t = shift || return 'undef';
41     # escape single quote and backspace
42 dpavlin 25 $t =~ s/(['\\])/\\$1/g && print STDERR "ESCAPED '$t'\n";
43 dpavlin 26 # quote string if not number
44     $t = "'$t'" unless ($t =~ m/^\d+$/);
45 dpavlin 25 return $t;
46     }
47    
48 dpavlin 35 my @lines;
49    
50 dpavlin 1 while(<STDIN>) {
51     chomp;
52    
53 dpavlin 9 if (!m/\t/ || m/\t$/) {
54     print STDERR "SKIP '$_': no tab\n";
55     next;
56     }
57    
58 dpavlin 35 # remove leading spaces (which are ignored if source list was
59     # sorted using locale)
60     s/^\s+//;
61    
62     push @lines, $_;
63     }
64    
65     # spaces will be ignored when sorting using locale. That's why we have
66     # cache of lines with spaces replaced by exclamation mark (!) so that
67     # sort order is strict and not dictionary. For more info, see:
68     # http://archives.postgresql.org/pgsql-sql/2002-04/msg00266.php
69     # http://groups.google.com/groups?selm=handler.82819.D82819.99045085113033.ackdone%40bugs.debian.org&output=gplain
70    
71     my %locale_space_fix;
72    
73     foreach (sort {
74     unless($locale_space_fix{$a}) {
75     my $tmp = $a;
76     $tmp =~ s/ /!/g;
77     $locale_space_fix{$a} = lc($tmp);
78     }
79     unless($locale_space_fix{$b}) {
80     my $tmp = $b;
81     $tmp =~ s/ /!/g;
82     $locale_space_fix{$b} = lc($tmp);
83     }
84     $locale_space_fix{$a} cmp $locale_space_fix{$b};
85     } @lines) {
86    
87 dpavlin 25 my @data = split(/\t+/,$_);
88 dpavlin 7
89 dpavlin 25 my $headline = shift @data || die "need at least headline!";
90    
91 dpavlin 9 if (length($headline) < $min_len) {
92     print STDERR "SKIP '$_': too short\n";
93     next;
94     }
95 dpavlin 1
96 dpavlin 9
97 dpavlin 1 # split into min_len part and rest
98 dpavlin 7 my ($part,$rest) = ( substr($headline,0,$min_len), substr($headline,$min_len) );
99 dpavlin 1
100     # make part lowercase
101     $part = lc($part);
102    
103     $last_part = $part if (! $last_part);
104    
105     # new part?
106     if ($part ne $last_part) {
107 dpavlin 7 print STDERR $last_part,"\t",$#part_arr+1,"\n" if ($debug && $#part_arr > $increase_at);
108 dpavlin 4 $max_elements = $#part_arr if ($#part_arr > $max_elements);
109 dpavlin 26 print "${headlines}[",escape_js($last_part),"] = [\n ",join(",\n ",@part_arr),"];\n" if (@part_arr);
110 dpavlin 1 $total += $#part_arr;
111     @part_arr = ();
112     $last_part = $part;
113     }
114 dpavlin 26 push @part_arr, "[".escape_js($headline).",".join(",",map { escape_js($_) } @data)."]";
115 dpavlin 1
116     # break out?
117     last if ($max && $total > $max);
118     }
119    
120 dpavlin 26 print "${headlines}[",escape_js($last_part)."] = [\n ",join(",\n ",@part_arr),"];\n" if (@part_arr);
121 dpavlin 7 print qq{
122 dpavlin 4
123 dpavlin 10 ${headlines}.min_len = $min_len;
124     ${headlines}.length = $total;
125 dpavlin 7
126     };
127    
128     print STDERR "You have more than $increase_at elements, so you should\nincrease min_len to ",$min_len+1," or higher for performance benefit.\n" if ($max_elements > $increase_at);

Properties

Name Value
svn:executable

  ViewVC Help
Powered by ViewVC 1.1.26