--- trunk/mbox2index.pl 2004/05/11 15:36:51 43 +++ trunk/mbox2index.pl 2004/05/25 18:55:46 52 @@ -1,10 +1,32 @@ #!/usr/bin/perl -w BEGIN { - my $basedir = readlink($0) || $0; $basedir =~ s#/[^/]+$##; + my $basedir = readlink($0) || $0; $basedir =~ s#/[^/]+$#lib#; unshift(@INC, $basedir); } +=head1 NAME + +mbox2index.pl - indexing script for Mail::Box Web Search + +=head1 SYNOPSIS + + mbox2index.pl [local.conf] + +=head1 DESCRIPTION + +This script will index mailboxes defined in C or local +configuration file supplied at command line. + +In normal operation, using MWS::SWISH it will exec swish-e which will in +turn again call this script, but this time with C<--recursive> option. + +=head1 SEE ALSO + +C perl modules which are part of this package + +=cut + use MWS::SWISH; #use MWS::Plucene; use Data::Dumper; @@ -44,7 +66,11 @@ my $folder = $mws->open_folder($mbox); - print STDERR $folder->size," bytes\n" if ($debug); + my $total = scalar $folder->messageIds; + + print STDERR "$total messages\n" if ($debug); + + my $count = 0; foreach my $message ($folder->messages) { @@ -58,7 +84,10 @@ foreach my $direction (qw(to from cc bcc)) { foreach my $part (qw(phrase address comment)) { my @data = $mws->unroll($message,$direction,$part); - $document->{$direction.'_'.$part} = join("##", @data) if (@data); + if (@data) { + $document->{$direction.'_'.$part} = join("##", @data); + $document->{$direction.'_'.$part} =~ s/\s*\(e\s*-\s*mail\)\s*//gi; + } } } @@ -69,7 +98,7 @@ my $utime = str2time($message->date); $document->{'date_utime'} = $utime; - $document->{'date'} = strftime("%Y-%m-%d %H:%M:%S", localtime($utime)); + $document->{'date'} = strftime("%Y-%m-%d %H:%M:%S", localtime($utime)) if ($utime); # print Dumper($document); $mws->add_index("$mbox $id" => $document); @@ -83,6 +112,9 @@ # this is not complete solution. 
see mailbox-destruct.diff $message->destruct(); + $count++; + printf STDERR "%d messages in $mbox done [%d %%]\n",$count,($count * 100/$total) if ($count % 100 == 0); + } $mws->close_folder($mbox);