Revision 88 (by dpavlin, 2008/03/16 20:55:56) better error message when run without arguments
#!/usr/bin/perl -w

use strict;
use Search::Estraier 0.07;
use DBI;
use Data::Dumper;
use Encode qw/from_to/;
use Time::HiRes qw/time/;
use Getopt::Long;

=head1 NAME

pgest-index.pl - create full-text index of some columns in your database

=cut

my $c = {
	debug => 0,
};

=head1 SYNOPSIS

  pgest-index.pl --create movies --sql "select id,title,year from movies"

  pgsql-index.pl --drop movies

Options:

=over 4

=item --create name

Create index C<name> and create triggers on table with same name

=item --drop name

Remove triggers from table C<name> and node with same name

=item --node-url http://localhost:1978/node/name

Full URI to node. If it's not specified, it's assumed that you are using
Hyper Estraier on C<http://localhost:1978/>.

=item --sql "select col1,col2 from name"

SQL query which will return names of columns which are included in full-text
index. Have in mind that you can't use aliases (as I<something>) in this SQL
query (or triggers will be created with wrong fields).

If SQL query isn't specified, default one C<< select * from movies >> will
be created. That will be B<serious performance hit> if all columns are
not needed for search.

=item --pk id

Specify name of primary key column in SQL query. If you allready have primary
key on table or unique index and it consists of simgle column
(compaund keys are not supported) it will be picked up automatically.

If you specify value which is not unique, you will get just last occurence
of that item in index (which might be what you want). That's because specified
C<pk> column will be used for C<@uri> in Hyper Estraier.

If name of primary key begins with C<_> it will not be added into text
indexing (so you won't be able to find prmary key value, but it will still
be available as attribute value).

=item --user login

=item --passwd password

Username and password to use when connecting to Hyper Estraier. If not specified,
C<admin> and C<admin> will be used.

=item --debug

Dump debugging output. It may be specified multiple times for more verbose
debugging.

=back

=cut

my $usage = "$0 database_name (--create|--drop) table_name [--sql='select id,foo,bar from table'] [--pk=id]\n";

GetOptions($c, qw/create=s drop=s node_url=s sql=s pk=s user=s passwd=s debug+/);

my $dbname = shift @ARGV || die $usage;

$c->{dbi} = 'Pg:dbname=' . $dbname;

warn "# c: ", Dumper($c) if ($c->{debug});

my $table = $c->{create} || $c->{drop} || die $usage;

$c->{node_url} = 'http://localhost:1978/node/' . $table;

$c->{user} ||= 'admin';
$c->{passwd} ||= 'admin';

# create and configure node
my $node = new Search::Estraier::Node(
	url => $c->{node_url},
	user => $c->{user},
	passwd => $c->{passwd},
	croak_on_error => 1,
	create => 1,
	debug => $c->{debug} >= 4 ? 1 : 0,
);

# create DBI connection
my $dbh = DBI->connect("DBI:$c->{dbi}","","") || die $DBI::errstr;

# drop existing triggers
sub drop_triggers {
	my $table = shift || die "no table?";

	my $sth = $dbh->prepare(qq{
		SELECT relname,tgname
		FROM pg_trigger JOIN pg_class ON relfilenode = tgrelid
		WHERE tgname LIKE 'pgest_trigger_%' AND relname = ?
	}) || $dbh->errstr;

	$sth->execute( $table ) || $sth->errstr();

	warn "there are ", $sth->rows, " triggers instead of just 3, dropping all\n" if ($sth->rows != 3);

	while (my $row = $sth->fetchrow_hashref) {
		my $sql = sprintf(qq{ DROP TRIGGER %s ON %s }, $row->{tgname}, $row->{relname} );
		#warn "# $sql\n";
		$dbh->do( $sql ) || $dbh->errstr;
	}

	warn "removed ", $sth->rows, " triggers from $table\n" if ($sth->rows);

}

if ($c->{drop}) {
	drop_triggers( $table );
	warn "removing node $table\n";
	$node->master(
		action => 'nodedel',
		name => $table,
	);
	exit;
}

# clear existing node
$node->master(
	action => 'nodeclr',
	name => $table,
);

# create PostgreSQL functions
$dbh->do(qq{

CREATE OR REPLACE FUNCTION pgest(text, text, text, int, text, text, text, int, int, text[])
	RETURNS setof record
	AS 'pgest','pgest_node'
	LANGUAGE 'C' IMMUTABLE CALLED ON NULL INPUT;

CREATE OR REPLACE FUNCTION pgest_trigger() RETURNS TRIGGER
        AS 'pgest', 'pgest_trigger'
	LANGUAGE 'C' STRICT;

}) || die $dbh->errstr();


drop_triggers( $table );

if (! $c->{pk}) {

	warn "# finding primary key for $table\n" if ($c->{debug});

	my $index_fmt = qq{
		SELECT
			a.attname, t.typname
		FROM pg_type t, pg_attribute a
		WHERE t.oid = a.atttypid AND attrelid = (
			SELECT indexrelid
			FROM pg_class c, pg_index i
			WHERE c.relname = '%s'
				AND c.oid = i.indrelid
				AND %s
				AND indnatts = 1
		)
	};

	$c->{pk} = $dbh->selectrow_array( sprintf($index_fmt, $table, 'indisprimary') );
	
	$c->{pk} ||= $dbh->selectrow_array( sprintf($index_fmt, $table, 'indisunique') );

}

die "$0: can't find single column primary key for table ${table}. Please specify column with --pk\n" unless ($c->{pk});

warn "using primary key $c->{pk}", $c->{pk} =~ m/^_/ ? " (not indexed)" : "", "\n";

$dbh->begin_work;

$c->{sql} ||= "select * from $table";

my $sth = $dbh->prepare($c->{sql}) || die $dbh->errstr();
$sth->execute() || die $sth->errstr;

my @cols = @{ $sth->{NAME} };

die "SQL '$c->{sql}' didn't include primary key $c->{pk}\n" unless grep(/^\Q$c->{pk}\E$/, @cols);

warn "# columns: ",join(",", @cols),"\n" if ($c->{debug});

my $total = $sth->rows;
my $i = 1;

my $t = time();
my $pk = $c->{pk} || 'id';

warn "indexing existing ",$sth->rows," rows\n";

while (my $row = $sth->fetchrow_hashref() ) {

	warn "# row: ",Dumper($row) if ($c->{debug} >= 3);

	# create document
	my $doc = new Search::Estraier::Document;

	if (my $id = $row->{$pk}) {
		$doc->add_attr('@uri', $id);
	} else {
		die "can't find pk column '$pk' in results\n";
	}

	my $log = sprintf "%4d ",$i;

	while (my ($col,$val) = each %{$row}) {

		if ($val) {
			# add attributes (make column usable from attribute search)
			$doc->add_attr($col, $val);

			# add body text to document (make it searchable using full-text index)
			$doc->add_text($val) unless ($col =~ m/^_/);

			$log .= "R";
		} else {
			$log .= ".";
		}

	}

	warn "# doc draft: ",$doc->dump_draft, "\n" if ($c->{debug} >= 2);

	die "error: ", $node->status,"\n" unless (eval { $node->put_doc($doc) });

	$log .= sprintf(" %d%% %.1f/s\r", int(( $i++ / $total) * 100), ( $i / (time() - $t) ) );

	print STDERR $log;

}

my $cols = "'" . join("', '", @cols) . "'";

foreach my $t (qw/UPDATE INSERT DELETE/) {

	my $lc_t = lc($t);

	my $sql = qq{

		CREATE TRIGGER pgest_trigger_${lc_t} AFTER ${t}
			ON ${table} FOR EACH ROW
			EXECUTE PROCEDURE pgest_trigger('$c->{node_url}','$c->{user}','$c->{passwd}',
				'$c->{pk}', $cols
			)

	};

	#warn "$sql\n";

	$dbh->do( $sql ) || die $dbh->errstr;

}

warn "created consistency triggers\n";

$dbh->commit;

=head1 SEARCHING

At end of each run, this script will output example search SQL query on STDOUT.

You can use it to quickly construct queries for your application.

=cut

my $col_names = join(', ', @cols);
my $col_def = join(', ', map { "$_ text" } @cols);

print "
-- example SQL search query:

SELECT $col_names
FROM pgest(
	-- node, login, passwd, depth
	'$c->{node_url}', '$c->{user}', '$c->{passwd}', 0,
	-- full text search
	'foo bar',
	-- attribute filter, order, limit, offset
	null, null, null, null,
	-- return columns
	array[$cols]
) as ($col_def);

";

__END__

=head1 AUTHOR

Dobrica Pavlinusic <dpavlin@rot13.org>

L<http://www.rot13.org/~dpavlin/>

=head1 LICENSE

This product is licensed under GNU Public License (GPL) v2 or later.

=cut