/[Grep]/bin/reindex.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Diff of /bin/reindex.pl

Parent Directory | Revision Log | View Patch

revision 110 by dpavlin, Wed Mar 14 20:02:19 2007 UTC | revision 127 by dpavlin, Sun Apr 29 00:16:05 2007 UTC
# Line 7  use strict; Line 7  use strict;
7  use lib 'lib';  use lib 'lib';
8    
9  use Jifty;  use Jifty;
 use Lucene;  
10  use Grep::Search;  use Grep::Search;
11  use Data::Dump qw/dump/;  use Data::Dump qw/dump/;
12    use Text::DeDuper;
13    use Encode;
14    
15    my $remove_duplicate = 1;
16    
17    $|=1;
18    
19  BEGIN { Jifty->new; };  BEGIN { Jifty->new; };
20    
# Line 19  $coll->unlimit; Line 24  $coll->unlimit;
24  print "indexing ", $coll->count, " items ";  print "indexing ", $coll->count, " items ";
25    
26  my $search = Grep::Search->new();  my $search = Grep::Search->new();
27    my $deduper = Text::DeDuper->new();
28    
29    my ( $total, $duplicates ) = ( 0, 0 );
30    
31  while ( my $i = $coll->next ) {  while ( my $i = $coll->next ) {
32    
33          $search->add( $i, $i->in_feed->owner->id );          my $c = encode('utf-8', $i->content);
34    
35          print $i->id, ' ';          if ( $remove_duplicate && $deduper->find_similar( $c ) ) {
36                    $i->delete;
37                    print "-",$i->id,"- ";
38                    $duplicates++;
39            } else {
40                    $search->add( $i, $i->in_feed->owner->id );
41                    print $i->id;
42                    $deduper->add_doc( $i->id, $c );
43                    print ' ';
44            }
45            $total++;
46  }  }
47    
48  print "\n";  print "$total records indexed", $remove_duplicate ? " ($duplicates duplicates)" : "", "\n";
49    
50  $search->finish;  $search->finish;

Legend:
Removed from v.110  
changed lines
  Added in v.127

  ViewVC Help
Powered by ViewVC 1.1.26