| Revision 337 (by dpavlin, 2004/06/10 19:22:40) |
new trunk for webpac v2
|
#!/usr/bin/perl
# Read files and emit entries for a fulltext index.
# The list of files to read is taken from stdin (e.g. produced by find).
# Index entries go to stdout; one record per file read goes to stderr.
# usage:
# find /foo -name \*.html | ./fulltext >/tmp/idx 2>/tmp/mst
# find /usr/share/doc -type f -a \! -name \*.htm\* | ./fulltext >/tmp/idx 2>/tmp/mst
# sort -o /tmp/idx /tmp/idx
# time ./openisis -write db/test/ft -stream -fmt mfn </tmp/mst
# time ./openisis -db db/test/ft -ifload 0 -v i </tmp/idx
# time ./openisis -db db/test/ft -ifchk -v i
# time ./openisis -db db/test/ft -search Descriptive -ifdump
use strict;
use warnings;

# Only the first $MAX_LINES non-blank lines of each file are indexed.
my $MAX_LINES = 255;

# Split one line of text into its words (maximal runs of \w characters).
# Empty fields (e.g. the leading one produced when the line starts with a
# non-word character) are dropped by length rather than by truth, so the
# word "0" is kept.
sub line_words {
    my ($text) = @_;
    return grep { length } split /\W+/, $text;
}

# Running number of the files opened so far; written as the second column
# of every index entry.
my $file_no = 0;

# One path per input line (stdin, or files named on the command line).
while ( my $path = <> ) {
    chomp $path;

    # Three-arg open with an explicit read mode: a path such as ">x" or
    # "cmd |" is treated as a plain file name, never as a mode or a pipe.
    # Paths that cannot be opened are skipped silently on purpose: STDERR
    # carries the record stream (see the usage notes), so a diagnostic
    # written there would end up inside the records.
    open( my $in, '<', $path ) or next;
    $file_no++;

    # Start of this file's record on STDERR: the literal "100", a tab and
    # the path (presumably field 100 of the record -- confirm against the
    # consumer of this stream).
    print STDERR "100\t$path\n";

    my $line_no = 0;
    while ( my $text = <$in> ) {
        next if $text =~ /^\s*$/;            # blank lines are not counted
        last if ++$line_no > $MAX_LINES;

        my $word_no = 0;
        for my $word ( line_words($text) ) {

            # Columns: upper-cased word, file number, the constant 800
            # (meaning not visible here -- confirm), line number within
            # the file, word position within the line.
            printf "%s\t%d\t%d\t%d\t%d\n",
                uc($word), $file_no, 800, $line_no, ++$word_no;
        }
    }
    close $in;

    # A form feed line ends this file's record on STDERR.
    print STDERR "\f\n";
}