--- bin/reindex.pl 2007/02/21 03:04:48 47 +++ bin/reindex.pl 2007/04/29 00:48:04 128 @@ -7,24 +7,45 @@ use lib 'lib'; use Jifty; -use Lucene; use Grep::Search; use Data::Dump qw/dump/; +use Text::DeDuper; +use Encode; + +my $remove_duplicate = 1; + +$|=1; BEGIN { Jifty->new; }; -my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1 ); +my $system_user = Grep::CurrentUser->superuser; +my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1, current_user => $system_user ); $coll->unlimit; print "indexing ", $coll->count, " items "; +my $search = Grep::Search->new(); +my $deduper = Text::DeDuper->new(); + +my ( $total, $duplicates ) = ( 0, 0 ); + while ( my $i = $coll->next ) { - Grep::Search->add( $i ); + my $c = encode('utf-8', $i->content); - print $i->id, ' '; + if ( $remove_duplicate && $deduper->find_similar( $c ) ) { + $i->delete; + print "-",$i->id,"- "; + $duplicates++; + } else { + $search->add( $i, $i->in_feed->owner->id ); + print $i->id; + $deduper->add_doc( $i->id, $c ); + print ' '; + } + $total++; } -print "\n"; +print "$total records indexed", $remove_duplicate ? " ($duplicates duplicates)" : "", "\n"; -Grep::Search->finish; +$search->finish;