1 |
dpavlin |
47 |
#!/usr/bin/perl |
2 |
|
|
|
3 |
|
|
# helper script to re-index full text index |
4 |
|
|
|
5 |
|
|
use strict; |
6 |
|
|
|
7 |
|
|
use lib 'lib'; |
8 |
|
|
|
9 |
|
|
use Jifty; |
10 |
|
|
use Grep::Search; |
11 |
|
|
use Data::Dump qw/dump/; |
12 |
dpavlin |
127 |
use Text::DeDuper; |
13 |
|
|
use Encode; |
14 |
dpavlin |
47 |
|
15 |
dpavlin |
127 |
# When true, items whose content the deduper reports as similar to an
# already-seen item are deleted instead of being indexed.
my $remove_duplicate = 1;

# Unbuffered output, so the per-item progress printed below shows up immediately.
$| = 1;

# Instantiate the Jifty application at compile time.
BEGIN { Jifty->new; };

# Iterate the item collection as the superuser.
# NOTE(review): results_are_readable presumably skips per-record read
# checks -- confirm against the Jifty collection documentation.
my $system_user = Grep::CurrentUser->superuser;
my $coll        = Grep::Model::ItemCollection->new(
    results_are_readable => 1,
    current_user         => $system_user,
);
$coll->unlimit;

print 'indexing ', $coll->count, ' items ';

# Full-text index writer and near-duplicate detector used by the main loop.
my $search  = Grep::Search->new();
my $deduper = Text::DeDuper->new();

# Running counters reported in the final summary line.
my $total      = 0;
my $duplicates = 0;
31 |
|
|
|
32 |
dpavlin |
47 |
# Walk every item in the collection. Items whose content the deduper
# reports as similar to something already seen are deleted (when
# $remove_duplicate is set); all others are added to the search index and
# registered with the deduper. Progress is printed as "ID " for indexed
# items and "-ID- " for deleted ones.
while ( my $item = $coll->next ) {

    # Read the id once, before anything can mutate the record, so the
    # progress output never has to query an object that was just deleted.
    my $item_id = $item->id;

    # The deduper is fed the content encoded as UTF-8 bytes.
    my $content_bytes = encode( 'utf-8', $item->content );

    if ( $remove_duplicate && $deduper->find_similar($content_bytes) ) {
        $item->delete;
        print "-", $item_id, "- ";
        $duplicates++;
    } else {
        # Second argument is the id of the owner of the feed this item is in.
        $search->add( $item, $item->in_feed->owner->id );
        print $item_id;
        $deduper->add_doc( $item_id, $content_bytes );
        print ' ';
    }

    # Counts every item visited, including the ones deleted as duplicates.
    $total++;
}
48 |
|
|
|
49 |
dpavlin |
127 |
# Final summary; the duplicate count is only mentioned when duplicate
# removal was enabled for this run.
my $duplicate_note = $remove_duplicate ? " ($duplicates duplicates)" : "";
print "$total records indexed", $duplicate_note, "\n";

# Tell the search backend that no more items will be added.
$search->finish;