/[Grep]/bin/reindex.pl
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Annotation of /bin/reindex.pl

Parent Directory | Revision Log


Revision 127 - (hide annotations)
Sun Apr 29 00:16:05 2007 UTC (17 years ago) by dpavlin
File MIME type: text/plain
File size: 918 byte(s)
Move from Lucene (mostly because of locking problems that prevented fastcgi
deployment, and later haunted the development server too) to KinoSearch.
For good measure, added (slow) de-duplication and increased version to 0.02.
#!/usr/bin/perl

# Helper script to re-index the full text index.
#
# Walks every item in Grep::Model::ItemCollection and adds it to the
# Grep::Search index.  When $remove_duplicate is true, each item's content
# is first checked with Text::DeDuper against the items already kept; an
# item reported as similar is deleted instead of being indexed.
#
# Progress is written to STDOUT: the id of every indexed item, and the id
# wrapped in dashes (-ID-) for every item deleted as a duplicate.

use strict;
use warnings;

use lib 'lib';

use Jifty;
use Grep::Search;
use Data::Dump qw/dump/;
use Text::DeDuper;
use Encode;

# Set to a false value to index every item without the similarity check.
my $remove_duplicate = 1;

# Autoflush the selected output handle so progress shows up item by item.
$| = 1;

BEGIN { Jifty->new; };

my $coll = Grep::Model::ItemCollection->new( results_are_readable => 1 );
$coll->unlimit;

print "indexing ", $coll->count, " items ";

my $search  = Grep::Search->new();
my $deduper = Text::DeDuper->new();

# $total counts every item visited (duplicates included);
# $duplicates counts only the items that were deleted.
my ( $total, $duplicates ) = ( 0, 0 );

while ( my $i = $coll->next ) {

    # Read the id once, before anything can delete the record, so the
    # progress output never has to query an already-deleted item.
    my $id = $i->id;

    # The de-duper is handed the content as UTF-8 encoded bytes.
    my $c = encode( 'utf-8', $i->content );

    if ( $remove_duplicate && $deduper->find_similar($c) ) {
        $i->delete;
        print "-", $id, "- ";
        $duplicates++;
    }
    else {
        $search->add( $i, $i->in_feed->owner->id );
        print $id;

        # Only items that were kept become reference documents for later
        # similarity checks.
        $deduper->add_doc( $id, $c );
        print ' ';
    }
    $total++;
}

print "$total records indexed", $remove_duplicate ? " ($duplicates duplicates)" : "", "\n";

# NOTE(review): presumably this commits/closes the index -- confirm against
# Grep::Search.
$search->finish;

Properties

Name Value
svn:executable *

  ViewVC Help
Powered by ViewVC 1.1.26