12 |
use Text::DeDuper; |
use Text::DeDuper; |
13 |
use Encode; |
use Encode; |
14 |
|
|
15 |
my $remove_duplicate = 1; |
my $remove_duplicates = 1; |
16 |
|
|
17 |
$|=1; |
$|=1; |
18 |
|
|
31 |
|
|
32 |
while ( my $i = $coll->next ) { |
while ( my $i = $coll->next ) { |
33 |
|
|
34 |
my $c = encode('utf-8', $i->content); |
print $i->id; |
35 |
|
|
36 |
|
if ( $remove_duplicates ) { |
37 |
|
|
38 |
|
my $c = encode('utf-8', $i->content); |
39 |
|
|
40 |
|
my @s = sort $deduper->find_similar( $c ); |
41 |
|
if ( @s ) { |
42 |
|
#warn " similar = ",dump( @s ); |
43 |
|
|
44 |
|
foreach my $id ( @s ) { |
45 |
|
next if $id == $i->id; # keep current |
46 |
|
my $si = Grep::Model::Item->new(); |
47 |
|
$si->load( $id ) or die "can't find similar item $id"; |
48 |
|
print " -$id-"; |
49 |
|
$si->delete; |
50 |
|
$duplicates++; |
51 |
|
} |
52 |
|
} |
53 |
|
|
|
if ( $remove_duplicate && $deduper->find_similar( $c ) ) { |
|
|
$i->delete; |
|
|
print "-",$i->id,"- "; |
|
|
$duplicates++; |
|
|
} else { |
|
|
$search->add( $i, $i->in_feed->owner->id ); |
|
|
print $i->id; |
|
54 |
$deduper->add_doc( $i->id, $c ); |
$deduper->add_doc( $i->id, $c ); |
|
print ' '; |
|
55 |
} |
} |
56 |
|
|
57 |
|
$search->add( $i, $i->in_feed->owner->id ); |
58 |
|
print ' '; |
59 |
$total++; |
$total++; |
60 |
} |
} |
61 |
|
|
62 |
print "$total records indexed", $remove_duplicate ? " ($duplicates duplicates)" : "", "\n"; |
print "$total records indexed", $remove_duplicates ? " ($duplicates duplicates)" : "", "\n"; |
63 |
|
|
64 |
$search->finish; |
$search->finish; |