# Patch for bin/reindex.pl (the trailing 47 and 129 on the header lines are
# presumably the old and new revision numbers -- confirm against the repository).
#
# What the hunk below changes, as far as the hunk itself shows:
#   - drops `use Lucene`; adds `use Text::DeDuper` and `use Encode`
#   - sets $| to 1, so output on the selected handle is flushed after each print
#   - builds the item collection with current_user set to
#     Grep::CurrentUser->superuser
#   - replaces the class-method calls Grep::Search->add / ->finish with calls on
#     an instance from Grep::Search->new; add() now also receives
#     $i->in_feed->owner->id
#   - when $remove_duplicates is true (hard-coded to 1): encodes each item's
#     content as UTF-8, asks the deduper for similar documents, loads and deletes
#     every returned id that differs from the current item's id, then registers
#     the current item with the deduper
#   - replaces the final bare newline with a summary line: records indexed and,
#     when deduplication is on, the number of duplicates deleted
#
# NOTE(review): `sort` is called without a comparator, so the returned ids are
#   ordered as strings, while the loop compares them numerically with `==`.
# NOTE(review): ids returned by find_similar were presumably registered (and
#   passed to $search->add) on an earlier iteration; confirm whether deleting
#   the item should also remove it from the search index.
# NOTE(review): this copy was rebuilt from a whitespace-collapsed export. Tab
#   indentation and the placement of blank context lines are inferred, and the
#   @@ header counts (24 old / 58 new) imply one more unchanged line than could
#   be placed here. Regenerate the diff from the repository before applying it.
#
--- bin/reindex.pl	2007/02/21 03:04:48	47
+++ bin/reindex.pl	2007/04/29 11:37:28	129
@@ -7,24 +7,58 @@
 use lib 'lib';
 
 use Jifty;
-use Lucene;
 use Grep::Search;
 use Data::Dump qw/dump/;
+use Text::DeDuper;
+use Encode;
+
+my $remove_duplicates = 1;
+
+$|=1;
 
 BEGIN { Jifty->new; };
 
-my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1 );
+my $system_user = Grep::CurrentUser->superuser;
+my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1, current_user => $system_user );
 $coll->unlimit;
 
 print "indexing ", $coll->count, " items ";
 
+my $search = Grep::Search->new();
+my $deduper = Text::DeDuper->new();
+
+my ( $total, $duplicates ) = ( 0, 0 );
+
 while ( my $i = $coll->next ) {
-	Grep::Search->add( $i );
+	print $i->id;
+
+	if ( $remove_duplicates ) {
+
+		my $c = encode('utf-8', $i->content);
 
-	print $i->id, ' ';
+		my @s = sort $deduper->find_similar( $c );
+		if ( @s ) {
+			#warn " similar = ",dump( @s );
+
+			foreach my $id ( @s ) {
+				next if $id == $i->id;	# keep current
+				my $si = Grep::Model::Item->new();
+				$si->load( $id ) or die "can't find similar item $id";
+				print " -$id-";
+				$si->delete;
+				$duplicates++;
+			}
+		}
+
+		$deduper->add_doc( $i->id, $c );
+	}
+
+	$search->add( $i, $i->in_feed->owner->id );
+	print ' ';
+	$total++;
 }
 
-print "\n";
+print "$total records indexed", $remove_duplicates ? " ($duplicates duplicates)" : "", "\n";
 
-Grep::Search->finish;
+$search->finish;