Revision 56 (by dpavlin, 2004/12/01 22:33:19) If I have class inheritance, I might as well use it :-)
#!/usr/bin/perl -w

# Add the "lib" directory that sits next to this script to @INC so the
# bundled MWS modules can be loaded, even when the script is started
# through a symlink or without a directory component in $0.
BEGIN {
	my $script = $0;

	# follow one level of symlink; a relative link target is relative to
	# the directory holding the link, not to the current directory
	my $target = readlink($script);
	if (defined($target) && length($target)) {
		if ($target !~ m{^/} && $script =~ m{^(.*/)}s) {
			$target = $1 . $target;
		}
		$script = $target;
	}

	# replace the file name with "lib"; a bare file name (no slash) means
	# the script lives in the current directory
	my $basedir = ($script =~ m{^(.*)/[^/]+$}s) ? "$1/lib" : 'lib';

	unshift(@INC, $basedir);
}

=head1 NAME

mbox2index.pl - indexing script for Mail::Box Web Search

=head1 SYNOPSIS

 mbox2index.pl [local.conf]

=head1 DESCRIPTION

This script will index the mailboxes defined in C<global.conf> or in a local
configuration file supplied on the command line.

In normal operation with C<MWS::SWISH>, it will exec swish-e, which will in
turn call this script again, this time with the C<--recursive> option.

=head1 SEE ALSO

C<MWS> perl modules which are part of this package

=cut

#use MWS::SWISH;
use MWS::Plucene;
use Data::Dumper;
use Date::Parse;
use POSIX qw(strftime);
use Getopt::Long;

# are we called from this script?
my $recursive = 0;

# GetOptions returns false when it meets an unknown or malformed option
# (it has already warned on STDERR by then); treat that as a usage error
# instead of silently ignoring it
my $r = GetOptions("recursive" => \$recursive);

# first remaining argument is the configuration file; fall back to
# global.conf in the current directory
my $config_file = shift(@ARGV) || 'global.conf';

if (! $r || ! -f $config_file) {
	print qq{Usage: $0 [/path/to/local.conf]

If local.conf is not specified, global.conf in current directory will
be used.
};
	exit 1;
}

#my $mws = MWS::SWISH->new(config_file => $config_file);
my $mws = MWS::Indexer->new(config_file => $config_file);

# the index is created only by the top-level invocation, not when this
# script is re-entered with --recursive
$mws->create_index if (! $recursive);

print STDERR "starting indexing...\n";

# when true, progress information is written to STDERR
my $debug = 1;

# Index every mailbox named in the 'folders' section of the configuration:
# each message becomes one document handed to $mws->add_index, keyed by
# "<folder name> <message id>".
foreach my $mbox ($mws->{config}->Parameters('folders')) {
	my $mbox_path = $mws->{config}->val('folders', $mbox);

	print STDERR "working on $mbox [$mbox_path]\n" if ($debug);

	my $folder = $mws->open_folder($mbox);

	# used only for the percentage in the progress report below
	my $total = scalar $folder->messageIds;

	print STDERR "$total messages\n" if ($debug);

	my $count = 0;

	foreach my $message ($folder->messages) {

		my $id = $message->messageId;

		my $document = {
			id => $id,
			folder => $mbox,
		};

		# one field per address header and address part, e.g. from_phrase,
		# to_address; multiple values are joined with "##"
		foreach my $direction (qw(to from cc bcc)) {
			foreach my $part (qw(phrase address comment)) {
				my @data = $mws->unroll($message,$direction,$part);
				next unless (@data);

				my $field = join("##", @data);
				# strip "(E-mail)" decorations from the joined value
				$field =~ s/\s*\(e\s*-\s*mail\)\s*//gi;
				$document->{$direction.'_'.$part} = $field;
			}
		}

		$document->{'subject'} = $mws->decode_qp($message->subject) || 'no subject';

		$document->{'body'} = $mws->plain_text_body($message);

		my $utime = $message->timestamp;

		$document->{'date_utime'} = $utime;
		# formatted date is added only when a timestamp is available
		$document->{'date'} = strftime("%Y-%m-%d %H:%M:%S", localtime($utime)) if ($utime);

#		print Dumper($document);
		$mws->add_index("$mbox $id" => $document);

		# clear internal MWS cache to keep memory usage down
		# (this should be replaced by garbage collector in MWS,
		# but without it this is the best solution to keep machine
		# alive while indexing)
		$mws->{cache} = {};

		# this is not complete solution. see mailbox-destruct.diff
		$message->destruct();

		$count++;

		# progress report every 100 messages; the folder name is passed as
		# a printf argument (not interpolated into the format) so a '%' in
		# the name cannot corrupt the format string
		if ($count % 100 == 0) {
			my $percent = $total ? ($count * 100 / $total) : 0;
			printf STDERR "%d messages in %s done [%d %%]\n", $count, $mbox, $percent;
		}

	}

	$mws->close_folder($mbox);

}

$mws->close_index;