Semantic-Engine/EPrints/index.pl

#!/usr/bin/perl -w

use strict;
use Semantic::API;
use Data::Dump qw/dump/;

use EPrints qw/_x/;

use lib '/home/dpavlin/stem-hr/';
use StemHR;

# Build the semantic index for all archived eprints.
#
# Usage: index.pl [debug]
#   debug - optional; any true value prints each record's stemmed body
#           to STDERR before the full text is appended.
#
# For every row of archive_title the script combines title, keywords,
# abstract and full text into one document and hands it to the indexer.
my $debug = shift @ARGV;

# Fetch id and title of every eprint; die with the driver's message on error.
my $dbh = EPrints->dbh;
my $sth = $dbh->prepare(qq{
SELECT
        archive_title.eprintid as id,
        title
FROM archive_title 
}) or die $dbh->errstr();
$sth->execute() or die $sth->errstr();

my $indexer = Semantic::API::Index->new(
        storage => 'sqlite',
        database => 'eprints.db',
        collection => 'EPrints'
);


$indexer->add_word_filters( minimum_length => 3,
                            too_many_numbers => 10,
                            maximum_word_length => 15 );

# use this encoding for any incoming text
$indexer->set_default_encoding( "iso-8859-2" ); 

my $total = 0;

while ( my $row = $sth->fetchrow_hashref ) {
        # Select the current record; lookup() and fulltext_content() below
        # take no id, so they presumably operate on this one -- TODO confirm.
        EPrints->id( $row->{id} );

        # Each value gets its own list assignment: a call that returns an
        # empty list (or several values) can then no longer shift the
        # following fields into the wrong variable.
        my ($title)    = _x( $row->{title} );
        my ($keywords) = EPrints->lookup( 'keywords' );
        my ($abstract) = EPrints->lookup( 'abstract' );

        # Missing fields become empty strings so that interpolating them
        # does not raise "uninitialized value" warnings under -w.
        foreach my $field ( $title, $keywords, $abstract ) {
                $field = '' unless defined $field;
        }

        # Title appears three times and keywords twice in the text to be
        # indexed (presumably to give them more weight than the abstract).
        # split can yield a leading empty token; drop those.
        my @words = grep { length $_ }
                split( /\W*\s+\W*/, "$title $title $title $keywords $keywords $abstract" );

        # Stemmed form of every word, space separated with a trailing space.
        my @stems = map { scalar StemHR->stem( $_ ) } @words;
        my $body  = join( ' ', @stems, '' );

        $body .= EPrints::slogovi( "$title $keywords $abstract" );

        warn "body: $body\n" if $debug;

        # Separate the full text from the preceding token with a space so
        # that the two cannot fuse into a single bogus word.
        my $fulltext = EPrints->fulltext_content;
        $body .= ' ' . $fulltext if defined $fulltext;

        # Index the raw words together with the stemmed/derived text.
        $indexer->index( $row->{id}, join( " ", @words, $body ) );
        $total++;
        print STDERR _x( $row->{id}, " ", $row->{title} ), "\n";
}


print STDERR "\nNow adding $total items to the database...";
$indexer->finish(); 
print STDERR "done!\n"; 

1	#!/usr/bin/perl -w
2
3	use strict;
4	use Semantic::API;
5	use Data::Dump qw/dump/;
6
7	use EPrints qw/_x/;
8
9	use lib '/home/dpavlin/stem-hr/';
10	use StemHR;
11
12	my $debug = shift @ARGV;
13
14	my $dbh = EPrints->dbh;
15	my $sth = $dbh->prepare(qq{
16	SELECT
17	archive_title.eprintid as id,
18	title
19	FROM archive_title
20	}) || die $dbh->errstr();
21	$sth->execute() || die $sth->errstr();
22
23	my $indexer = Semantic::API::Index->new(
24	storage => 'sqlite',
25	database => 'eprints.db',
26	collection => 'EPrints'
27	);
28
29
30	$indexer->add_word_filters( minimum_length => 3,
31	too_many_numbers => 10,
32	maximum_word_length => 15 );
33
34	# use this encoding for any incoming text
35	$indexer->set_default_encoding( "iso-8859-2" );
36
37	my $total = 0;
38
39	while (my $row = $sth->fetchrow_hashref ) {
40	EPrints->id( $row->{id} );
41	my ( $title, $keywords, $abstract ) = (
42	_x( $row->{title} ),
43	EPrints->lookup( 'keywords' ),
44	EPrints->lookup( 'abstract' )
45	);
46	my @body = split( /\W*\s+\W*/, "$title $title $title $keywords $keywords $abstract" );
47	my $body = '';
48	foreach my $word ( @body ) {
49	$body .= StemHR->stem( $word ) . ' ';
50	}
51
52	$body .= EPrints::slogovi( "$title $keywords $abstract" );
53
54	warn "body: $body\n" if $debug;
55
56	$body .= EPrints->fulltext_content;
57
58	$indexer->index( $row->{id}, join(" ", @body, $body ) );
59	$total++;
60	print STDERR _x( $row->{id}, " ", $row->{title} ), "\n";
61	}
62
63
64	print STDERR "\nNow adding $total items to the database...";
65	$indexer->finish();
66	print STDERR "done!\n";
67